16#ifndef HIGHWAY_HWY_BASE_H_
17#define HIGHWAY_HWY_BASE_H_
// Stringification helpers. HWY_STR_IMPL converts its argument to a string
// literal with the preprocessor `#` operator. HWY_STR adds one level of
// indirection so that an argument which is itself a macro is expanded before
// it is stringified.
34#define HWY_STR_IMPL(macro) #macro
35#define HWY_STR(macro) HWY_STR_IMPL(macro)
// Compiler-abstraction macros in their MSVC-style spelling (__declspec,
// __forceinline, __pragma).
// NOTE(review): the #if that selects this group is not visible in this
// listing - presumably an MSVC-only branch; confirm against the full header.
41#define HWY_RESTRICT __restrict
42#define HWY_INLINE __forceinline
43#define HWY_NOINLINE __declspec(noinline)
45#define HWY_NORETURN __declspec(noreturn)
// No branch-prediction hint in this group: the expression is passed through.
46#define HWY_LIKELY(expr) (expr)
47#define HWY_UNLIKELY(expr) (expr)
// Emits `tokens` as a pragma via the __pragma keyword.
48#define HWY_PRAGMA(tokens) __pragma(tokens)
// Wraps `tokens` in a `warning(...)` pragma.
49#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Accepts both spellings of a diagnostic directive; only `msc` is used here.
50#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
// Expands to nothing in this group.
51#define HWY_MAYBE_UNUSED
52#define HWY_HAS_ASSUME_ALIGNED 0
// Two alternative definitions of HWY_MUST_USE_RESULT: the `_Check_return_`
// annotation, or nothing.
// NOTE(review): the conditional that chooses between them is not visible in
// this listing.
54#define HWY_MUST_USE_RESULT _Check_return_
56#define HWY_MUST_USE_RESULT
// Compiler-abstraction macros in their GCC/Clang-style spelling
// (__attribute__, __builtin_expect, _Pragma).
// NOTE(review): the conditional that selects this group is not visible in
// this listing; confirm against the full header.
61#define HWY_RESTRICT __restrict__
62#define HWY_INLINE inline __attribute__((always_inline))
63#define HWY_NOINLINE __attribute__((noinline))
64#define HWY_FLATTEN __attribute__((flatten))
65#define HWY_NORETURN __attribute__((noreturn))
// Branch hints: `!!(expr)` normalizes the condition to 0/1 before it is paired
// with the expected value (1 for likely, 0 for unlikely).
66#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
67#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
// _Pragma takes a string literal, so `tokens` is stringified with `#`.
68#define HWY_PRAGMA(tokens) _Pragma(#tokens)
// Prefixes `tokens` with `GCC diagnostic` to form the pragma.
69#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Accepts both spellings of a diagnostic directive; only `gcc` is used here.
70#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
73#define HWY_MAYBE_UNUSED __attribute__((unused))
74#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
// HWY_FORMAT(idx_fmt, idx_arg): when HWY_HAS_ATTRIBUTE reports the __format__
// attribute, expands to a __printf__-archetype format attribute that receives
// the two indices unchanged; the second definition expands to nothing.
// NOTE(review): the #else/#endif separating the two definitions are not
// visible in this listing.
82#if HWY_HAS_ATTRIBUTE(__format__)
83#define HWY_FORMAT(idx_fmt, idx_arg) \
84 __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
86#define HWY_FORMAT(idx_fmt, idx_arg)
// HWY_ASSUME_ALIGNED(ptr, align): when HWY_HAS_BUILTIN reports
// __builtin_assume_aligned, forwards both arguments to that builtin; the
// second definition evaluates to `ptr` unchanged and ignores `align`.
// NOTE(review): the #else/#endif separating the two definitions are not
// visible in this listing.
94#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
95#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
97#define HWY_ASSUME_ALIGNED(ptr, align) (ptr)
// HWY_PUSH_ATTRIBUTES(targets_str) / HWY_POP_ATTRIBUTES open and close a
// region of pragma-applied target settings.
// - Clang: pushes a `target(targets_str)` attribute with `apply_to = function`
//   and pops it again.
// - GCC: `push_options` followed by `target targets_str`, undone by
//   `pop_options`.
// - Final pair: both macros expand to nothing.
// NOTE(review): the #else/#endif around the final pair are not visible in this
// listing.
103#if HWY_COMPILER_CLANG
104#define HWY_PUSH_ATTRIBUTES(targets_str) \
105 HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
106 apply_to = function))
107#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
108#elif HWY_COMPILER_GCC
109#define HWY_PUSH_ATTRIBUTES(targets_str) \
110 HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
111#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
113#define HWY_PUSH_ATTRIBUTES(targets_str)
114#define HWY_POP_ATTRIBUTES
// Declaration prefix combining `static` with HWY_INLINE, HWY_FLATTEN and
// HWY_MAYBE_UNUSED; each of those expands according to the compiler-specific
// definitions in this header.
120#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
// Token pasting. HWY_CONCAT_IMPL joins `a` and `b` with `##`. HWY_CONCAT adds
// one level of indirection so that arguments which are themselves macros are
// expanded before they are pasted.
122#define HWY_CONCAT_IMPL(a, b) a##b
123#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
// Minimum / maximum of two expressions, usable in constant expressions.
// Each argument appears twice in the expansion, so an argument with side
// effects (e.g. `i++`) may be evaluated twice.
125#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
126#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
// Expands to a std::atomic_thread_fence call with memory_order_acq_rel.
// The expansion has no trailing semicolon; the use site supplies it.
132#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
// Repeats `literal` four times as a comma-separated list, e.g. for use inside
// a braced initializer.
139#define HWY_REP4(literal) literal, literal, literal, literal
// Calls ::hwy::Abort with the current __FILE__ and __LINE__, followed by
// `format` and any further arguments. Relies on the `##__VA_ARGS__`
// comma-elision extension so that HWY_ABORT("text") with no extra arguments
// does not leave a trailing comma.
141#define HWY_ABORT(format, ...) \
142 ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
145#define HWY_ASSERT(condition) \
147 if (!(condition)) { \
148 HWY_ABORT("Assert %s", #condition); \
152#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
158#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
164#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
173#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
175#define HWY_ATTR_NO_MSAN
179#if !defined(HWY_IS_DEBUG_BUILD)
182#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
183 HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
184#define HWY_IS_DEBUG_BUILD 1
186#define HWY_IS_DEBUG_BUILD 0
190#if HWY_IS_DEBUG_BUILD
191#define HWY_DASSERT(condition) HWY_ASSERT(condition)
193#define HWY_DASSERT(condition) \
205#elif HWY_ARCH_RVV && defined(__riscv_vector)
219#define HWY_ALIGN_MAX alignas(64)
220#elif HWY_ARCH_RVV && defined(__riscv_vector)
221#define HWY_ALIGN_MAX alignas(8)
223#define HWY_ALIGN_MAX alignas(16)
232#if HWY_ARCH_ARM && (__ARM_FP & 2)
233#define HWY_NATIVE_FLOAT16 1
235#define HWY_NATIVE_FLOAT16 0
240#if HWY_NATIVE_FLOAT16
241using float16_t = __fp16;
301template <
bool Condition>
308template <
bool Condition>
311template <
typename T,
typename U>
321template <
typename T,
typename U>
// Constraints on N * sizeof(T), compared against 4, 8 or 16 bytes (the 32, 64
// and 128 in the macro names). Each expands to an unnamed, defaulted template
// parameter of the form `hwy::EnableIf<cond>* = nullptr`, intended for use in
// a template parameter list so that the template is only viable when `cond`
// holds. T is presumably the lane type and N the lane count.
// N is parenthesized so that an expression argument (e.g. `A + B`) is
// multiplied by sizeof(T) as a whole rather than only its last term.
#define HWY_IF_LE128(T, N) hwy::EnableIf<(N) * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<(N) * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<(N) * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<(N) * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<(N) * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<(N) * sizeof(T) >= 16>* = nullptr
// The whole condition is parenthesized because a bare `>` would otherwise
// close the template argument list.
#define HWY_IF_GT128(T, N) hwy::EnableIf<((N) * sizeof(T) > 16)>* = nullptr
// Constraints on the category of type T, for use as an unnamed, defaulted
// template parameter (`hwy::EnableIf<cond>* = nullptr`).
// All predicates are spelled with an explicit `hwy::` qualifier, matching
// hwy::EnableIf, so the macros also resolve when expanded outside namespace
// hwy.
// HWY_IF_UNSIGNED: hwy::IsSigned<T>() is false.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
// HWY_IF_SIGNED: hwy::IsSigned<T>() is true and T is not a float type.
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
// HWY_IF_FLOAT / HWY_IF_NOT_FLOAT: hwy::IsFloat<T>() is true / false.
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
// Constraints on sizeof(T), for use as an unnamed, defaulted template
// parameter (`hwy::EnableIf<cond>* = nullptr`). `bytes` is parenthesized, so
// it may be an expression.
// HWY_IF_LANE_SIZE: sizeof(T) equals `bytes`.
346#define HWY_IF_LANE_SIZE(T, bytes) \
347 hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
// HWY_IF_NOT_LANE_SIZE: sizeof(T) differs from `bytes`.
348#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
349 hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// HWY_IF_LANE_SIZE_LT: sizeof(T) is strictly less than `bytes`.
350#define HWY_IF_LANE_SIZE_LT(T, bytes) \
351 hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
// Constraint for use as an unnamed, defaulted template parameter: holds when
// HWY_MIN(sizeof(T) * N, 16) / sizeof(T) equals LANES, i.e. the number of
// T-sized elements in the first (at most) 16 bytes of N elements.
// N is parenthesized so that an expression argument (e.g. `A + B`) is
// multiplied by sizeof(T) as a whole.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * (N), 16) / sizeof(T) == (LANES)>* = nullptr
530 return IsSame<T, float>() || IsSame<T, double>();
549 static_assert(!IsFloat<T>(),
"Only for integer types");
551 return static_cast<T
>(IsSigned<T>() ? (
static_cast<TU
>(~0ull) >> 1)
552 :
static_cast<TU
>(~0ull));
556 static_assert(!IsFloat<T>(),
"Only for integer types");
557 return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
564 return LimitsMin<T>();
568 return -3.402823466e+38F;
572 return -1.7976931348623158e+308;
577 return LimitsMax<T>();
581 return 3.402823466e+38F;
585 return 1.7976931348623158e+308;
591 static_assert(
sizeof(T) == 0,
"Only instantiate the specializations");
619 return (~(
MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
632 static_assert(
sizeof(T) == 0,
"Only instantiate the specializations");
642 return 4503599627370496.0;
649 return 8 *
sizeof(T) - 1 - MantissaBits<T>();
663template <
typename T1,
typename T2>
665 return (a + b - 1) / b;
669constexpr inline size_t RoundUpTo(
size_t what,
size_t align) {
670 return DivCeil(what, align) * align;
677 _BitScanForward(&index, x);
680 return static_cast<size_t>(__builtin_ctz(x));
688 _BitScanForward64(&index, x);
692 uint32_t lsb =
static_cast<uint32_t
>(x & 0xFFFFFFFF);
695 uint32_t msb =
static_cast<uint32_t
>(x >> 32u);
696 _BitScanForward(&index, msb);
699 _BitScanForward(&index, lsb);
704 return static_cast<size_t>(__builtin_ctzll(x));
712 _BitScanReverse(&index, x);
715 return static_cast<size_t>(__builtin_clz(x));
723 _BitScanReverse64(&index, x);
727 const uint32_t msb =
static_cast<uint32_t
>(x >> 32u);
730 const uint32_t lsb =
static_cast<uint32_t
>(x & 0xFFFFFFFF);
731 _BitScanReverse(&index, lsb);
734 _BitScanReverse(&index, msb);
739 return static_cast<size_t>(__builtin_clzll(x));
744#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
745 return static_cast<size_t>(__builtin_popcountll(x));
750#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
751 return _mm_popcnt_u64(x);
752#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
753 return _mm_popcnt_u32(
static_cast<uint32_t
>(x & 0xFFFFFFFFu)) +
754 _mm_popcnt_u32(
static_cast<uint32_t
>(x >> 32));
756 x -= ((x >> 1) & 0x5555555555555555ULL);
757 x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
758 x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
762 return static_cast<size_t>(x & 0x7Fu);
769template <
typename TI>
773 :
static_cast<size_t>(
FloorLog2(
static_cast<TI
>(x >> 1)) + 1);
776template <
typename TI>
780 :
static_cast<size_t>(
FloorLog2(
static_cast<TI
>(x - 1)) + 1);
783#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
784#pragma intrinsic(_umul128)
789#if defined(__SIZEOF_INT128__)
790 __uint128_t product = (__uint128_t)a * (__uint128_t)b;
791 *upper = (uint64_t)(product >> 64);
792 return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
793#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
794 return _umul128(a, b, upper);
796 constexpr uint64_t kLo32 = 0xFFFFFFFFU;
797 const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
798 const uint64_t hi_lo = (a >> 32) * (b & kLo32);
799 const uint64_t lo_hi = (a & kLo32) * (b >> 32);
800 const uint64_t hi_hi = (a >> 32) * (b >> 32);
801 const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
802 *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
803 return (t << 32) | (lo_lo & kLo32);
808#pragma intrinsic(memcpy)
809#pragma intrinsic(memset)
813template <
size_t kBytes,
typename From,
typename To>
816 memcpy(to, from, kBytes);
818 __builtin_memcpy(to, from, kBytes);
822template <
size_t kBytes,
typename To>
825 memset(to, 0, kBytes);
827 __builtin_memset(to, 0, kBytes);
832 uint32_t bits = bf.
bits;
835 CopyBytes<4>(&bits, &f);
841 CopyBytes<4>(&f, &bits);
843 bf.
bits =
static_cast<uint16_t
>(bits >> 16);
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_NORETURN
Definition: base.h:65
#define HWY_API
Definition: base.h:120
#define HWY_MAYBE_UNUSED
Definition: base.h:73
#define HWY_DLLEXPORT
Definition: highway_export.h:13
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
constexpr T MantissaEnd()
Definition: base.h:631
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
constexpr MakeSigned< T > MaxExponentTimes2()
Definition: base.h:606
constexpr MakeUnsigned< T > MantissaMask()
Definition: base.h:624
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:831
HWY_API void ZeroBytes(To *to)
Definition: base.h:823
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:839
HWY_API constexpr T LimitsMin()
Definition: base.h:555
typename detail::TypeFromSize< N >::Float FloatFromSize
Definition: base.h:521
HWY_API constexpr T HighestValue()
Definition: base.h:576
typename RemoveConstT< T >::type RemoveConst
Definition: base.h:370
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
constexpr float HighestValue< float >()
Definition: base.h:580
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition: base.h:519
constexpr T1 DivCeil(T1 a, T2 b)
Definition: base.h:664
constexpr float MantissaEnd< float >()
Definition: base.h:636
double float64_t
Definition: base.h:258
HWY_API constexpr bool IsSame()
Definition: base.h:322
constexpr bool IsSigned< bfloat16_t >()
Definition: base.h:542
HWY_API constexpr bool IsSigned()
Definition: base.h:534
constexpr size_t FloorLog2(TI x)
Definition: base.h:770
constexpr MakeUnsigned< T > ExponentMask()
Definition: base.h:618
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:709
constexpr bool IsSigned< float16_t >()
Definition: base.h:538
constexpr double HighestValue< double >()
Definition: base.h:584
constexpr int MantissaBits< double >()
Definition: base.h:599
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
static HWY_MAYBE_UNUSED bool operator>(const uint128_t &a, const uint128_t &b)
Definition: base.h:283
float float32_t
Definition: base.h:257
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
constexpr double MantissaEnd< double >()
Definition: base.h:640
constexpr int MantissaBits()
Definition: base.h:590
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:674
constexpr float LowestValue< float >()
Definition: base.h:567
constexpr MakeSigned< T > MaxExponentField()
Definition: base.h:656
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:719
constexpr MakeUnsigned< T > SignMask()
Definition: base.h:612
constexpr double LowestValue< double >()
Definition: base.h:571
static HWY_MAYBE_UNUSED bool operator<(const uint128_t &a, const uint128_t &b)
Definition: base.h:278
HWY_API constexpr T LowestValue()
Definition: base.h:563
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize
Definition: base.h:209
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Float MakeFloat
Definition: base.h:507
HWY_API constexpr bool IsFloat()
Definition: base.h:527
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
constexpr int MantissaBits< float >()
Definition: base.h:595
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char *file
HWY_DLLEXPORT HWY_NORETURN void int const char * format
Definition: base.h:848
HWY_DLLEXPORT HWY_NORETURN void int line
Definition: base.h:848
HWY_API constexpr T LimitsMax()
Definition: base.h:548
constexpr size_t RoundUpTo(size_t what, size_t align)
Definition: base.h:669
typename detail::Relations< T >::Narrow MakeNarrow
Definition: base.h:513
constexpr int ExponentBits()
Definition: base.h:647
void type
Definition: base.h:305
@ value
Definition: base.h:313
uint64_t value
Definition: base.h:272
uint64_t key
Definition: base.h:273
T type
Definition: base.h:366
T type
Definition: base.h:362
uint16_t bits
Definition: base.h:252
int16_t Signed
Definition: base.h:451
float Wide
Definition: base.h:452
uint16_t Unsigned
Definition: base.h:450
double Float
Definition: base.h:466
uint64_t Unsigned
Definition: base.h:464
int64_t Signed
Definition: base.h:465
float Narrow
Definition: base.h:467
int16_t Signed
Definition: base.h:444
float Wide
Definition: base.h:446
uint16_t Unsigned
Definition: base.h:443
uint32_t Unsigned
Definition: base.h:456
double Wide
Definition: base.h:459
float Float
Definition: base.h:458
int32_t Signed
Definition: base.h:457
uint16_t Unsigned
Definition: base.h:400
int16_t Signed
Definition: base.h:401
int32_t Wide
Definition: base.h:402
int8_t Narrow
Definition: base.h:403
uint32_t Unsigned
Definition: base.h:415
int64_t Wide
Definition: base.h:418
float Float
Definition: base.h:417
int16_t Narrow
Definition: base.h:419
int32_t Signed
Definition: base.h:416
int32_t Narrow
Definition: base.h:434
double Float
Definition: base.h:433
uint64_t Unsigned
Definition: base.h:431
int64_t Signed
Definition: base.h:432
int16_t Wide
Definition: base.h:389
int8_t Signed
Definition: base.h:388
uint8_t Unsigned
Definition: base.h:387
uint64_t Narrow
Definition: base.h:439
uint8_t Narrow
Definition: base.h:396
int16_t Signed
Definition: base.h:394
uint32_t Wide
Definition: base.h:395
uint16_t Unsigned
Definition: base.h:393
uint32_t Unsigned
Definition: base.h:407
uint64_t Wide
Definition: base.h:410
uint16_t Narrow
Definition: base.h:411
float Float
Definition: base.h:409
int32_t Signed
Definition: base.h:408
uint32_t Narrow
Definition: base.h:427
int64_t Signed
Definition: base.h:424
uint64_t Unsigned
Definition: base.h:423
double Float
Definition: base.h:425
int8_t Signed
Definition: base.h:382
uint8_t Unsigned
Definition: base.h:381
uint16_t Wide
Definition: base.h:383
int8_t Signed
Definition: base.h:475
uint8_t Unsigned
Definition: base.h:474
int16_t Signed
Definition: base.h:480
uint16_t Unsigned
Definition: base.h:479
int32_t Signed
Definition: base.h:485
uint32_t Unsigned
Definition: base.h:484
float Float
Definition: base.h:486
double Float
Definition: base.h:492
int64_t Signed
Definition: base.h:491
uint64_t Unsigned
Definition: base.h:490
uint16_t bits
Definition: base.h:247
uint64_t lo
Definition: base.h:265
uint64_t hi
Definition: base.h:266