33template <
typename T,
size_t N = 16 /
sizeof(T)>
40 return *
this = (*
this * other);
43 return *
this = (*
this / other);
46 return *
this = (*
this + other);
49 return *
this = (*
this - other);
52 return *
this = (*
this & other);
55 return *
this = (*
this | other);
58 return *
this = (*
this ^ other);
66 T raw[16 /
sizeof(T)] = {};
70template <
typename T,
size_t N = 16 /
sizeof(T)>
74 return b ?
static_cast<Raw>(
~Raw{0}) : 0;
78 Raw bits[16 /
sizeof(T)] = {};
85 template <
typename T,
size_t N>
94using DFromV =
decltype(detail::Deduce128()(V()));
97using TFromV = TFromD<DFromV<V>>;
101template <
typename T,
size_t N,
typename FromT,
size_t FromN>
104 static_assert(
sizeof(T) *
N ==
sizeof(FromT) * FromN,
105 "Casting does not change size");
106 CopyBytes<sizeof(T) * N>(
v.raw, to.
raw);
112template <
typename T,
size_t N>
115 ZeroBytes<sizeof(T) * N>(
v.raw);
122template <
typename T,
size_t N,
typename T2>
125 for (
size_t i = 0; i <
N; ++i) {
126 v.raw[i] =
static_cast<T
>(t);
131template <
typename T,
size_t N>
138template <
typename T, HWY_IF_FLOAT(T)>
143template <
typename T, HWY_IF_NOT_FLOAT(T)>
146 return static_cast<T
>(
static_cast<TU
>(
static_cast<TU
>(t) + TU{1}) &
147 hwy::LimitsMax<TU>());
152template <
typename T,
size_t N,
typename T2>
155 T counter =
static_cast<T
>(first);
156 for (
size_t i = 0; i <
N; ++i) {
166template <
typename T,
size_t N>
170 using TU =
TFromD<
decltype(du)>;
172 for (
size_t i = 0; i <
N; ++i) {
173 vu.raw[i] =
static_cast<TU
>(~vu.raw[i]);
179template <
typename T,
size_t N>
185 for (
size_t i = 0; i <
N; ++i) {
186 au.raw[i] &= bu.raw[i];
190template <
typename T,
size_t N>
196template <
typename T,
size_t N>
202template <
typename T,
size_t N>
208 for (
size_t i = 0; i <
N; ++i) {
209 au.raw[i] |= bu.raw[i];
213template <
typename T,
size_t N>
219template <
typename T,
size_t N>
225 for (
size_t i = 0; i <
N; ++i) {
226 au.raw[i] ^= bu.raw[i];
230template <
typename T,
size_t N>
237template <
typename T,
size_t N>
238HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
239 return Or(o1,
Or(o2, o3));
243template <
typename T,
size_t N>
244HWY_API Vec128<T, N>
OrAnd(
const Vec128<T, N> o,
const Vec128<T, N> a1,
245 const Vec128<T, N> a2) {
246 return Or(o,
And(a1, a2));
250template <
typename T,
size_t N>
257template <
typename T,
size_t N>
259 const Vec128<T, N> sign) {
260 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
261 const auto msb =
SignBit(Simd<T, N, 0>());
265template <
typename T,
size_t N>
267 const Vec128<T, N> sign) {
268 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
273template <
typename T,
size_t N>
276 for (
size_t i = 0; i <
N; ++i) {
277 v.raw[i] =
v.raw[i] < 0 ? T(-1) : T(0);
284template <
typename TFrom,
typename TTo,
size_t N>
286 Mask128<TFrom, N> mask) {
288 static_assert(
sizeof(TTo) *
N ==
sizeof(TFrom) *
N,
"Must have same size");
289 CopyBytes<sizeof(TTo) * N>(mask.bits, to.bits);
294template <
typename T,
size_t N>
297 static_assert(
sizeof(
v) ==
sizeof(mask),
"Must have same size");
298 CopyBytes<sizeof(T) * N>(
v.raw, mask.bits);
302template <
typename T,
size_t N>
305 CopyBytes<sizeof(T) * N>(mask.
bits,
v.raw);
309template <
typename T,
size_t N>
314template <
typename T,
size_t N>
317 for (
size_t i = 0; i <
N; ++i) {
324template <
typename T,
size_t N>
330template <
typename T,
size_t N>
332 const Vec128<T, N> yes) {
336template <
typename T,
size_t N>
338 const Vec128<T, N> no) {
342template <
typename T,
size_t N>
345 for (
size_t i = 0; i <
N; ++i) {
346 v.raw[i] =
v.raw[i] < 0 ? yes.raw[i] : no.raw[i];
351template <
typename T,
size_t N>
358template <
typename T,
size_t N>
359HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
363template <
typename T,
size_t N>
364HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
365 const Simd<T, N, 0>
d;
369template <
typename T,
size_t N>
370HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
371 const Simd<T, N, 0>
d;
375template <
typename T,
size_t N>
376HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
377 const Simd<T, N, 0>
d;
381template <
typename T,
size_t N>
382HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
383 const Simd<T, N, 0>
d;
391template <
int kBits,
typename T,
size_t N>
393 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
394 for (
size_t i = 0; i <
N; ++i) {
396 v.raw[i] =
static_cast<T
>(shifted);
401template <
int kBits,
typename T,
size_t N>
403 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
404#if __cplusplus >= 202002L
407 for (
size_t i = 0; i <
N; ++i) {
408 v.raw[i] =
static_cast<T
>(
v.raw[i] >> kBits);
415 for (
size_t i = 0; i <
N; ++i) {
416 const TU shifted =
static_cast<TU
>(
static_cast<TU
>(
v.raw[i]) >> kBits);
417 const TU sign =
v.raw[i] < 0 ?
static_cast<TU
>(~TU{0}) : 0;
418 const size_t sign_shift =
419 static_cast<size_t>(
static_cast<int>(
sizeof(TU)) * 8 - 1 - kBits);
420 const TU upper =
static_cast<TU
>(sign << sign_shift);
421 v.raw[i] =
static_cast<T
>(shifted | upper);
424 for (
size_t i = 0; i <
N; ++i) {
425 v.raw[i] =
static_cast<T
>(
v.raw[i] >> kBits);
439 template <
typename T,
size_t N>
441 return Or(ShiftRight<kBits>(
v),
ShiftLeft<
sizeof(T) * 8 - kBits>(
v));
447 template <
typename T,
size_t N>
455template <
int kBits,
typename T,
size_t N>
457 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
463template <
typename T,
size_t N>
465 for (
size_t i = 0; i <
N; ++i) {
467 v.raw[i] =
static_cast<T
>(shifted);
472template <
typename T,
size_t N>
474#if __cplusplus >= 202002L
477 for (
size_t i = 0; i <
N; ++i) {
478 v.raw[i] =
static_cast<T
>(
v.raw[i] >> bits);
485 for (
size_t i = 0; i <
N; ++i) {
486 const TU shifted =
static_cast<TU
>(
static_cast<TU
>(
v.raw[i]) >> bits);
487 const TU sign =
v.raw[i] < 0 ?
static_cast<TU
>(~TU{0}) : 0;
488 const size_t sign_shift =
489 static_cast<size_t>(
static_cast<int>(
sizeof(TU)) * 8 - 1 - bits);
490 const TU upper =
static_cast<TU
>(sign << sign_shift);
491 v.raw[i] =
static_cast<T
>(shifted | upper);
494 for (
size_t i = 0; i <
N; ++i) {
495 v.raw[i] =
static_cast<T
>(
v.raw[i] >> bits);
504template <
typename T,
size_t N>
506 for (
size_t i = 0; i <
N; ++i) {
509 v.raw[i] =
static_cast<T
>(shifted);
514template <
typename T,
size_t N>
516#if __cplusplus >= 202002L
519 for (
size_t i = 0; i <
N; ++i) {
520 v.raw[i] =
static_cast<T
>(
v.raw[i] >> bits.
raw[i]);
527 for (
size_t i = 0; i <
N; ++i) {
529 static_cast<TU
>(
static_cast<TU
>(
v.raw[i]) >> bits.
raw[i]);
530 const TU sign =
v.raw[i] < 0 ?
static_cast<TU
>(~TU{0}) : 0;
531 const size_t sign_shift =
static_cast<size_t>(
532 static_cast<int>(
sizeof(TU)) * 8 - 1 - bits.
raw[i]);
533 const TU upper =
static_cast<TU
>(sign << sign_shift);
534 v.raw[i] =
static_cast<T
>(shifted | upper);
537 for (
size_t i = 0; i <
N; ++i) {
538 v.raw[i] =
static_cast<T
>(
v.raw[i] >> bits.
raw[i]);
547template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
549 for (
size_t i = 0; i <
N; ++i) {
550 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw[i]);
551 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw[i]);
552 a.
raw[i] =
static_cast<T
>((a64 + b64) &
static_cast<uint64_t
>(~T(0)));
556template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
558 for (
size_t i = 0; i <
N; ++i) {
559 a.raw[i] += b.raw[i];
564template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
566 for (
size_t i = 0; i <
N; ++i) {
567 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw[i]);
568 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw[i]);
569 a.
raw[i] =
static_cast<T
>((a64 - b64) &
static_cast<uint64_t
>(~T(0)));
573template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
575 for (
size_t i = 0; i <
N; ++i) {
576 a.raw[i] -= b.raw[i];
585 Vec128<uint64_t, (
N + 7) / 8> sums;
586 for (
size_t i = 0; i <
N; ++i) {
587 sums.
raw[i / 8] +=
v.raw[i];
593template <
typename T,
size_t N>
595 for (
size_t i = 0; i <
N; ++i) {
596 a.
raw[i] =
static_cast<T
>(
598 hwy::HighestValue<T>()));
604template <
typename T,
size_t N>
606 for (
size_t i = 0; i <
N; ++i) {
607 a.
raw[i] =
static_cast<T
>(
609 hwy::HighestValue<T>()));
615template <
typename T,
size_t N, HWY_IF_UNSIGNED(T)>
617 for (
size_t i = 0; i <
N; ++i) {
618 a.
raw[i] =
static_cast<T
>((a.
raw[i] + b.
raw[i] + 1) / 2);
625template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
627 for (
size_t i = 0; i <
N; ++i) {
628 const T s = a.
raw[i];
629 const T min = hwy::LimitsMin<T>();
630 a.
raw[i] =
static_cast<T
>((s >= 0 || s == min) ? a.
raw[i] : -s);
634template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
636 for (
size_t i = 0; i <
N; ++i) {
637 v.raw[i] = std::abs(
v.raw[i]);
644template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
646 for (
size_t i = 0; i <
N; ++i) {
652template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
653HWY_API Vec128<T, N>
Min(Vec128<T, N> a,
const Vec128<T, N> b) {
654 for (
size_t i = 0; i <
N; ++i) {
655 if (std::isnan(a.raw[i])) {
657 }
else if (std::isnan(b.raw[i])) {
660 a.raw[i] =
HWY_MIN(a.raw[i], b.raw[i]);
666template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
668 for (
size_t i = 0; i <
N; ++i) {
674template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
675HWY_API Vec128<T, N>
Max(Vec128<T, N> a,
const Vec128<T, N> b) {
676 for (
size_t i = 0; i <
N; ++i) {
677 if (std::isnan(a.raw[i])) {
679 }
else if (std::isnan(b.raw[i])) {
682 a.raw[i] =
HWY_MAX(a.raw[i], b.raw[i]);
690template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
695template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
697 return Zero(Simd<T, N, 0>()) -
v;
702template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
704 for (
size_t i = 0; i <
N; ++i) {
710template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
712 for (
size_t i = 0; i <
N; ++i) {
713 a.raw[i] =
static_cast<T
>(
static_cast<int64_t
>(a.raw[i]) * b.raw[i]);
718template <
typename T,
size_t N, HWY_IF_UNSIGNED(T)>
720 for (
size_t i = 0; i <
N; ++i) {
721 a.raw[i] =
static_cast<T
>(
static_cast<uint64_t
>(a.raw[i]) * b.raw[i]);
726template <
typename T,
size_t N>
728 for (
size_t i = 0; i <
N; ++i) {
738 for (
size_t i = 0; i <
N; ++i) {
739 a.
raw[i] =
static_cast<int16_t
>((int32_t{a.
raw[i]} * b.
raw[i]) >> 16);
746 for (
size_t i = 0; i <
N; ++i) {
750 a.
raw[i] =
static_cast<uint16_t
>(
751 (
static_cast<uint32_t
>(a.
raw[i]) *
static_cast<uint32_t
>(b.
raw[i])) >>
760 for (
size_t i = 0; i <
N; ++i) {
761 a.
raw[i] =
static_cast<int16_t
>((2 * a.
raw[i] * b.
raw[i] + 32768) >> 16);
768HWY_API Vec128<int64_t, (
N + 1) / 2>
MulEven(
const Vec128<int32_t, N> a,
769 const Vec128<int32_t, N> b) {
770 Vec128<int64_t, (
N + 1) / 2> mul;
771 for (
size_t i = 0; i <
N; i += 2) {
772 const int64_t a64 = a.raw[i];
773 mul.raw[i / 2] = a64 * b.raw[i];
779 const Vec128<uint32_t, N> b) {
780 Vec128<uint64_t, (
N + 1) / 2> mul;
781 for (
size_t i = 0; i <
N; i += 2) {
782 const uint64_t a64 = a.raw[i];
783 mul.raw[i / 2] = a64 * b.raw[i];
791 Vec128<int64_t, (
N + 1) / 2> mul;
792 for (
size_t i = 0; i <
N; i += 2) {
793 const int64_t a64 = a.
raw[i + 1];
794 mul.raw[i / 2] = a64 * b.
raw[i + 1];
801 Vec128<uint64_t, (
N + 1) / 2> mul;
802 for (
size_t i = 0; i <
N; i += 2) {
803 const uint64_t a64 = a.
raw[i + 1];
804 mul.raw[i / 2] = a64 * b.
raw[i + 1];
811 for (
size_t i = 0; i <
N; ++i) {
815 v.raw[i] = (std::abs(
v.raw[i]) == 0.0f) ? 0.0f : 1.0f /
v.raw[i];
827template <
typename T,
size_t N>
830 return mul * x + add;
833template <
typename T,
size_t N>
836 return add - mul * x;
839template <
typename T,
size_t N>
842 return mul * x - sub;
845template <
typename T,
size_t N>
848 return Neg(mul) * x - sub;
855 for (
size_t i = 0; i <
N; ++i) {
856 const float half =
v.raw[i] * 0.5f;
858 CopyBytes<4>(&
v.raw[i], &bits);
860 bits = 0x5F3759DF - (bits >> 1);
861 CopyBytes<4>(&bits, &
v.raw[i]);
863 v.raw[i] =
v.raw[i] * (1.5f - (half *
v.raw[i] *
v.raw[i]));
868template <
typename T,
size_t N>
870 for (
size_t i = 0; i <
N; ++i) {
871 v.raw[i] = std::sqrt(
v.raw[i]);
878template <
typename T,
size_t N>
882 for (
size_t i = 0; i <
N; ++i) {
883 if (!(a.
raw[i] < MantissaEnd<T>())) {
886 const T bias =
v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
887 const TI rounded =
static_cast<TI
>(
v.raw[i] + bias);
889 v.raw[i] =
v.raw[i] < 0 ? T{-0} : T{0};
892 const T rounded_f =
static_cast<T
>(rounded);
894 if ((rounded & 1) && std::abs(rounded_f -
v.raw[i]) == T(0.5)) {
895 v.raw[i] =
static_cast<T
>(rounded - (
v.raw[i] < T(0) ? -1 : 1));
898 v.raw[i] = rounded_f;
909 const Vec128<float, N> abs =
Abs(
v);
910 Vec128<int32_t, N> ret;
911 for (
size_t i = 0; i <
N; ++i) {
912 const bool signbit = std::signbit(
v.raw[i]);
914 if (!(abs.raw[i] < MantissaEnd<T>())) {
916 if (!(abs.raw[i] <=
static_cast<T
>(LimitsMax<TI>()))) {
917 ret.raw[i] = signbit ? LimitsMin<TI>() :
LimitsMax<TI>();
920 ret.raw[i] =
static_cast<TI
>(
v.raw[i]);
923 const T bias =
v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
924 const TI rounded =
static_cast<TI
>(
v.raw[i] + bias);
929 const T rounded_f =
static_cast<T
>(rounded);
931 if ((rounded & 1) && std::abs(rounded_f -
v.raw[i]) == T(0.5)) {
932 ret.raw[i] = rounded - (signbit ? -1 : 1);
935 ret.raw[i] = rounded;
940template <
typename T,
size_t N>
944 for (
size_t i = 0; i <
N; ++i) {
945 if (!(abs.
raw[i] <= MantissaEnd<T>())) {
948 const TI truncated =
static_cast<TI
>(
v.raw[i]);
949 if (truncated == 0) {
950 v.raw[i] =
v.raw[i] < 0 ? -T{0} : T{0};
953 v.raw[i] =
static_cast<T
>(truncated);
959template <
typename Float,
size_t N>
961 constexpr int kMantissaBits = MantissaBits<Float>();
963 const Bits kExponentMask = MaxExponentField<Float>();
964 const Bits kMantissaMask = MantissaMask<Float>();
965 const Bits kBias = kExponentMask / 2;
967 for (
size_t i = 0; i <
N; ++i) {
968 const bool positive =
v.raw[i] > Float(0.0);
971 CopyBytes<sizeof(Bits)>(&
v.raw[i], &bits);
974 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
976 if (exponent >= kMantissaBits)
continue;
979 v.raw[i] = positive ? Float{1} : Float{-0.0};
983 const Bits mantissa_mask = kMantissaMask >> exponent;
985 if ((bits & mantissa_mask) == 0)
continue;
988 if (positive) bits += (kMantissaMask + 1) >> exponent;
989 bits &= ~mantissa_mask;
991 CopyBytes<sizeof(Bits)>(&bits, &
v.raw[i]);
997template <
typename Float,
size_t N>
999 constexpr int kMantissaBits = MantissaBits<Float>();
1001 const Bits kExponentMask = MaxExponentField<Float>();
1002 const Bits kMantissaMask = MantissaMask<Float>();
1003 const Bits kBias = kExponentMask / 2;
1005 for (
size_t i = 0; i <
N; ++i) {
1006 const bool negative =
v.raw[i] < Float(0.0);
1009 CopyBytes<sizeof(Bits)>(&
v.raw[i], &bits);
1011 const int exponent =
1012 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1014 if (exponent >= kMantissaBits)
continue;
1017 v.raw[i] = negative ? Float(-1.0) : Float(0.0);
1021 const Bits mantissa_mask = kMantissaMask >> exponent;
1023 if ((bits & mantissa_mask) == 0)
continue;
1026 if (negative) bits += (kMantissaMask + 1) >> exponent;
1027 bits &= ~mantissa_mask;
1029 CopyBytes<sizeof(Bits)>(&bits, &
v.raw[i]);
1036template <
typename T,
size_t N>
1039 for (
size_t i = 0; i <
N; ++i) {
1042 memcpy(&bits, &
v.raw[i],
sizeof(T));
1051template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1053 const Simd<T, N, 0>
d;
1061template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1063 const Simd<T, N, 0>
d;
1066 using VI =
VFromD<
decltype(di)>;
1067 using VU =
VFromD<
decltype(du)>;
1079template <
typename T,
size_t N>
1082 for (
size_t i = 0; i <
N; ++i) {
1088template <
typename T,
size_t N>
1091 for (
size_t i = 0; i <
N; ++i) {
1097template <
typename T,
size_t N>
1099 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1100 return (
v & bit) == bit;
1103template <
typename T,
size_t N>
1106 for (
size_t i = 0; i <
N; ++i) {
1111template <
typename T,
size_t N>
1112HWY_API Mask128<T, N>
operator>(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1114 for (
size_t i = 0; i <
N; ++i) {
1120template <
typename T,
size_t N>
1123 for (
size_t i = 0; i <
N; ++i) {
1128template <
typename T,
size_t N>
1131 for (
size_t i = 0; i <
N; ++i) {
1152 const bool lt = a.
raw[1] < b.
raw[1];
1160template <
class D,
class V = VFromD<D>>
1165template <
class D,
class V = VFromD<D>>
1170template <
class D,
class V = VFromD<D>>
1175template <
class D,
class V = VFromD<D>>
1184template <
typename T,
size_t N>
1188 CopyBytes<sizeof(T) * N>(aligned,
v.raw);
1192template <
typename T,
size_t N>
1198template <
typename T,
size_t N>
1204template <
typename T,
size_t N>
1207 return Load(
d, aligned);
1212template <
typename T,
size_t N>
1215 CopyBytes<sizeof(T) * N>(
v.raw, aligned);
1218template <
typename T,
size_t N>
1223template <
typename T,
size_t N>
1226 for (
size_t i = 0; i <
N; ++i) {
1227 if (m.bits[i]) p[i] =
v.raw[i];
1236#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1237#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1239#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1242template <
typename T,
size_t N>
1245 alignas(16) T buf0[
N];
1246 alignas(16) T buf1[
N];
1247 for (
size_t i = 0; i <
N; ++i) {
1248 buf0[i] = *unaligned++;
1249 buf1[i] = *unaligned++;
1255template <
typename T,
size_t N>
1259 alignas(16) T buf0[
N];
1260 alignas(16) T buf1[
N];
1261 alignas(16) T buf2[
N];
1262 for (
size_t i = 0; i <
N; ++i) {
1263 buf0[i] = *unaligned++;
1264 buf1[i] = *unaligned++;
1265 buf2[i] = *unaligned++;
1272template <
typename T,
size_t N>
1276 alignas(16) T buf0[
N];
1277 alignas(16) T buf1[
N];
1278 alignas(16) T buf2[
N];
1279 alignas(16) T buf3[
N];
1280 for (
size_t i = 0; i <
N; ++i) {
1281 buf0[i] = *unaligned++;
1282 buf1[i] = *unaligned++;
1283 buf2[i] = *unaligned++;
1284 buf3[i] = *unaligned++;
1294template <
typename T,
size_t N>
1298 for (
size_t i = 0; i <
N; ++i) {
1299 *unaligned++ = v0.
raw[i];
1300 *unaligned++ = v1.
raw[i];
1304template <
typename T,
size_t N>
1308 for (
size_t i = 0; i <
N; ++i) {
1309 *unaligned++ = v0.
raw[i];
1310 *unaligned++ = v1.
raw[i];
1311 *unaligned++ = v2.
raw[i];
1315template <
typename T,
size_t N>
1320 for (
size_t i = 0; i <
N; ++i) {
1321 *unaligned++ = v0.
raw[i];
1322 *unaligned++ = v1.
raw[i];
1323 *unaligned++ = v2.
raw[i];
1324 *unaligned++ = v3.
raw[i];
1330template <
typename T,
size_t N>
1338template <
typename T,
size_t N,
typename Offset>
1341 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1342 for (
size_t i = 0; i <
N; ++i) {
1343 uint8_t*
const base8 =
reinterpret_cast<uint8_t*
>(base) + offset.
raw[i];
1348template <
typename T,
size_t N,
typename Index>
1351 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1352 for (
size_t i = 0; i <
N; ++i) {
1353 base[index.
raw[i]] =
v.raw[i];
1359template <
typename T,
size_t N,
typename Offset>
1362 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1364 for (
size_t i = 0; i <
N; ++i) {
1365 const uint8_t* base8 =
1366 reinterpret_cast<const uint8_t*
>(base) + offset.
raw[i];
1372template <
typename T,
size_t N,
typename Index>
1375 const Vec128<Index, N> index) {
1376 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1378 for (
size_t i = 0; i <
N; ++i) {
1379 v.raw[i] = base[index.raw[i]];
1389template <
typename FromT,
typename ToT,
size_t N>
1392 static_assert(
sizeof(ToT) >
sizeof(FromT),
"Not promoting");
1394 for (
size_t i = 0; i <
N; ++i) {
1396 ret.
raw[i] =
static_cast<ToT
>(from.
raw[i]);
1407 for (
size_t i = 0; i <
N; ++i) {
1409 if (std::isinf(from.
raw[i]) ||
1415 ret.
raw[i] =
static_cast<float>(from.
raw[i]);
1423 for (
size_t i = 0; i <
N; ++i) {
1425 if (std::isinf(from.
raw[i]) ||
1426 std::fabs(from.
raw[i]) >
static_cast<double>(HighestValue<int32_t>())) {
1427 ret.
raw[i] = std::signbit(from.
raw[i]) ? LowestValue<int32_t>()
1428 : HighestValue<int32_t>();
1431 ret.
raw[i] =
static_cast<int32_t
>(from.
raw[i]);
1436template <
typename FromT,
typename ToT,
size_t N>
1439 static_assert(!IsFloat<FromT>(),
"FromT=double are handled above");
1440 static_assert(
sizeof(ToT) <
sizeof(FromT),
"Not demoting");
1443 for (
size_t i = 0; i <
N; ++i) {
1447 ret.
raw[i] =
static_cast<ToT
>(from.
raw[i]);
1454 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
1455 const Repartition<uint32_t,
decltype(dbf16)> du32;
1456 const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(
BitCast(du32, b));
1458 const Vec128<uint32_t, N> a_mask =
Set(du32, 0xFFFF0000);
1466#if HWY_NATIVE_FLOAT16
1467 CopyBytes<2>(&val, to);
1474#if HWY_NATIVE_FLOAT16
1476 CopyBytes<2>(from, &bits16);
1487 const Vec128<float16_t, N>
v) {
1488 Vec128<float, N> ret;
1489 for (
size_t i = 0; i <
N; ++i) {
1491 const uint32_t sign =
static_cast<uint32_t
>(bits16 >> 15);
1492 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1493 const uint32_t mantissa = bits16 & 0x3FF;
1496 if (biased_exp == 0) {
1497 const float subnormal =
1498 (1.0f / 16384) * (
static_cast<float>(mantissa) * (1.0f / 1024));
1499 ret.raw[i] = sign ? -subnormal : subnormal;
1505 const uint32_t biased_exp32 = biased_exp + (127 - 15);
1506 const uint32_t mantissa32 = mantissa << (23 - 10);
1507 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1508 CopyBytes<4>(&bits32, &ret.raw[i]);
1515 const Vec128<bfloat16_t, N>
v) {
1516 Vec128<float, N> ret;
1517 for (
size_t i = 0; i <
N; ++i) {
1525 const Vec128<float, N>
v) {
1526 Vec128<float16_t, N> ret;
1527 for (
size_t i = 0; i <
N; ++i) {
1529 CopyBytes<4>(&
v.raw[i], &bits32);
1530 const uint32_t sign = bits32 >> 31;
1531 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1532 const uint32_t mantissa32 = bits32 & 0x7FFFFF;
1534 const int32_t exp =
HWY_MIN(
static_cast<int32_t
>(biased_exp32) - 127, 15);
1538 ZeroBytes<sizeof(uint16_t)>(&ret.raw[i]);
1542 uint32_t biased_exp16, mantissa16;
1547 const uint32_t sub_exp =
static_cast<uint32_t
>(-14 - exp);
1549 mantissa16 =
static_cast<uint32_t
>((1u << (10 - sub_exp)) +
1550 (mantissa32 >> (13 + sub_exp)));
1553 biased_exp16 =
static_cast<uint32_t
>(exp + 15);
1554 HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1555 mantissa16 = mantissa32 >> 13;
1559 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1561 const uint16_t narrowed =
static_cast<uint16_t
>(bits16);
1569 const Vec128<float, N>
v) {
1570 Vec128<bfloat16_t, N> ret;
1571 for (
size_t i = 0; i <
N; ++i) {
1577template <
typename FromT,
typename ToT,
size_t N, HWY_IF_FLOAT(FromT)>
1580 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1582 for (
size_t i = 0; i <
N; ++i) {
1585 const double f =
static_cast<double>(from.
raw[i]);
1586 if (std::isinf(from.
raw[i]) ||
1587 std::fabs(f) >
static_cast<double>(LimitsMax<ToT>())) {
1589 std::signbit(from.
raw[i]) ? LimitsMin<ToT>() : LimitsMax<ToT>();
1592 ret.
raw[i] =
static_cast<ToT
>(from.
raw[i]);
1597template <
typename FromT,
typename ToT,
size_t N, HWY_IF_NOT_FLOAT(FromT)>
1599 Vec128<FromT, N> from) {
1600 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1602 for (
size_t i = 0; i <
N; ++i) {
1604 ret.raw[i] =
static_cast<ToT
>(from.raw[i]);
1616template <
typename T,
size_t N>
1623template <
typename T,
size_t N>
1629template <
typename T,
size_t N>
1637template <
typename T,
size_t N>
1639 Vec128<T, N / 2>
v) {
1645template <
typename T,
size_t N>
1654template <
typename T,
size_t N>
1663template <
typename T,
size_t N>
1672template <
typename T,
size_t N>
1682template <
typename T,
size_t N>
1686 CopyBytes<
N / 2 *
sizeof(T)>(lo.raw, &ret.raw[0]);
1687 CopyBytes<
N / 2 *
sizeof(T)>(&hi.raw[
N / 2], &ret.raw[
N / 2]);
1691template <
typename T,
size_t N>
1695 for (
size_t i = 0; i <
N / 2; ++i) {
1696 ret.
raw[i] = lo.
raw[2 * i];
1698 for (
size_t i = 0; i <
N / 2; ++i) {
1699 ret.
raw[
N / 2 + i] = hi.
raw[2 * i];
1704template <
typename T,
size_t N>
1708 for (
size_t i = 0; i <
N / 2; ++i) {
1709 ret.
raw[i] = lo.
raw[2 * i + 1];
1711 for (
size_t i = 0; i <
N / 2; ++i) {
1712 ret.
raw[
N / 2 + i] = hi.
raw[2 * i + 1];
1719template <
int kBytes,
typename T,
size_t N,
class V = Vec128<T, N>>
1723 reinterpret_cast<const uint8_t *
HWY_RESTRICT>(lo.raw);
1726 CopyBytes<
sizeof(T) *
N - kBytes>(lo8 + kBytes, ret8);
1727 CopyBytes<kBytes>(hi.raw, ret8 +
sizeof(T) *
N - kBytes);
1733template <
int kBytes,
typename T,
size_t N>
1735 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1739 ZeroBytes<kBytes>(ret8);
1740 CopyBytes<
sizeof(T) *
N - kBytes>(
v.raw, ret8 + kBytes);
1744template <
int kBytes,
typename T,
size_t N>
1746 return ShiftLeftBytes<kBytes>(
DFromV<
decltype(
v)>(),
v);
1751template <
int kLanes,
typename T,
size_t N>
1757template <
int kLanes,
typename T,
size_t N>
1759 return ShiftLeftLanes<kLanes>(
DFromV<
decltype(
v)>(),
v);
1763template <
int kBytes,
typename T,
size_t N>
1765 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1771 CopyBytes<
sizeof(T) *
N - kBytes>(v8 + kBytes, ret8);
1772 ZeroBytes<kBytes>(ret8 +
sizeof(T) *
N - kBytes);
1777template <
int kLanes,
typename T,
size_t N>
1785template <
typename T,
size_t N>
1790template <
typename T,
size_t N>
1796template <
typename T,
size_t N>
1801template <
typename T,
size_t N>
1803 for (
size_t i = 0; i <
N; i += 2) {
1804 v.raw[i + 1] =
v.raw[i];
1809template <
typename T,
size_t N>
1811 for (
size_t i = 0; i <
N; i += 2) {
1812 v.raw[i] =
v.raw[i + 1];
1817template <
typename T,
size_t N>
1818HWY_API Vec128<T, N>
OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
1819 for (
size_t i = 0; i <
N; i += 2) {
1820 odd.raw[i] = even.raw[i];
1825template <
typename T,
size_t N>
1832template <
typename T,
size_t N>
1840template <
typename T,
size_t N>
1845template <
typename T,
size_t N,
typename TI>
1847 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane size");
1849 CopyBytes<N * sizeof(T)>(vec.
raw, ret.raw);
1853template <
typename T,
size_t N,
typename TI>
1858template <
typename T,
size_t N>
1860 const Indices128<T, N> idx) {
1862 for (
size_t i = 0; i <
N; ++i) {
1863 ret.raw[i] =
v.raw[idx.raw[i]];
1871template <
typename T,
size_t N>
1879template <
typename T,
size_t N>
1882 for (
size_t i = 0; i <
N; ++i) {
1883 ret.
raw[i] =
v.raw[
N - 1 - i];
1888template <
typename T,
size_t N>
1891 for (
size_t i = 0; i <
N; i += 2) {
1892 ret.
raw[i + 0] =
v.raw[i + 1];
1893 ret.
raw[i + 1] =
v.raw[i + 0];
1898template <
typename T,
size_t N>
1901 for (
size_t i = 0; i <
N; i += 4) {
1902 ret.
raw[i + 0] =
v.raw[i + 3];
1903 ret.
raw[i + 1] =
v.raw[i + 2];
1904 ret.
raw[i + 2] =
v.raw[i + 1];
1905 ret.
raw[i + 3] =
v.raw[i + 0];
1910template <
typename T,
size_t N>
1913 for (
size_t i = 0; i <
N; i += 8) {
1914 ret.
raw[i + 0] =
v.raw[i + 7];
1915 ret.
raw[i + 1] =
v.raw[i + 6];
1916 ret.
raw[i + 2] =
v.raw[i + 5];
1917 ret.
raw[i + 3] =
v.raw[i + 4];
1918 ret.
raw[i + 4] =
v.raw[i + 3];
1919 ret.
raw[i + 5] =
v.raw[i + 2];
1920 ret.
raw[i + 6] =
v.raw[i + 1];
1921 ret.
raw[i + 7] =
v.raw[i + 0];
1931template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1933 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
1938template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1941 ret.
raw[3] =
v.raw[1];
1942 ret.
raw[2] =
v.raw[0];
1943 ret.
raw[1] =
v.raw[3];
1944 ret.
raw[0] =
v.raw[2];
1947template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1953template <
typename T>
1956 ret.raw[3] =
v.raw[0];
1957 ret.raw[2] =
v.raw[3];
1958 ret.raw[1] =
v.raw[2];
1959 ret.raw[0] =
v.raw[1];
1964template <
typename T>
1967 ret.raw[3] =
v.raw[2];
1968 ret.raw[2] =
v.raw[1];
1969 ret.raw[1] =
v.raw[0];
1970 ret.raw[0] =
v.raw[3];
1974template <
typename T>
1981template <
int kLane,
typename T,
size_t N>
1983 for (
size_t i = 0; i <
N; ++i) {
1984 v.raw[i] =
v.raw[kLane];
1991template <
typename T,
size_t N,
typename TI,
size_t NI>
1997 reinterpret_cast<const uint8_t*
>(indices.
raw);
2001 for (
size_t i = 0; i < NI *
sizeof(TI); ++i) {
2002 const size_t idx = idx_bytes[i];
2004 ret_bytes[i] = idx <
sizeof(T) *
N ? v_bytes[idx] : 0;
2009template <
typename T,
size_t N,
typename TI,
size_t NI>
2018template <
typename T,
size_t N>
2022 for (
size_t i = 0; i <
N / 2; ++i) {
2023 ret.
raw[2 * i + 0] = a.
raw[i];
2024 ret.
raw[2 * i + 1] = b.
raw[i];
2035template <
typename T,
size_t N>
2040 for (
size_t i = 0; i <
N / 2; ++i) {
2041 ret.
raw[2 * i + 0] = a.
raw[
N / 2 + i];
2042 ret.
raw[2 * i + 1] = b.
raw[
N / 2 + i];
2051template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2055template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2060template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2067template <
typename T,
size_t N>
2070 for (
size_t i = 0; i <
N; ++i) {
2071 or_sum |= mask.bits[i];
2076template <
typename T,
size_t N>
2079 constexpr Bits kAll =
static_cast<Bits
>(~Bits{0});
2080 Bits and_sum = kAll;
2081 for (
size_t i = 0; i <
N; ++i) {
2082 and_sum &= mask.
bits[i];
2084 return and_sum == kAll;
2088template <
typename T,
size_t N>
2092 for (
size_t i = 0; i <
N; ++i) {
2093 const size_t bit =
size_t{1} << (i & 7);
2094 const size_t idx_byte = i >> 3;
2101template <
typename T,
size_t N>
2105 if (
N > 8) bits[1] = 0;
2106 for (
size_t i = 0; i <
N; ++i) {
2107 const size_t bit =
size_t{1} << (i & 7);
2108 const size_t idx_byte = i >> 3;
2110 bits[idx_byte] =
static_cast<uint8_t
>(bits[idx_byte] | bit);
2113 return N > 8 ? 2 : 1;
2116template <
typename T,
size_t N>
2119 for (
size_t i = 0; i <
N; ++i) {
2120 count += mask.
bits[i] != 0;
2125template <
typename T,
size_t N>
2127 const Mask128<T, N> mask) {
2128 for (
size_t i = 0; i <
N; ++i) {
2129 if (mask.bits[i] != 0)
return static_cast<intptr_t
>(i);
2131 return intptr_t{-1};
2136template <
typename T>
2137struct CompressIsPartition {
2141template <
typename T,
size_t N>
2145 for (
size_t i = 0; i <
N; ++i) {
2147 ret.
raw[count++] =
v.raw[i];
2150 for (
size_t i = 0; i <
N; ++i) {
2151 if (!mask.
bits[i]) {
2152 ret.
raw[count++] =
v.raw[i];
2160template <
typename T,
size_t N>
2164 for (
size_t i = 0; i <
N; ++i) {
2165 if (!mask.
bits[i]) {
2166 ret.
raw[count++] =
v.raw[i];
2169 for (
size_t i = 0; i <
N; ++i) {
2171 ret.
raw[count++] =
v.raw[i];
2180 Mask128<uint64_t> ) {
2185template <
typename T,
size_t N>
2192template <
typename T,
size_t N>
2197 for (
size_t i = 0; i <
N; ++i) {
2199 unaligned[count++] =
v.raw[i];
2206template <
typename T,
size_t N>
2214template <
typename T,
size_t N>
2226 Vec128<bfloat16_t, 2 * N> a,
2227 Vec128<bfloat16_t, 2 * N> b,
2228 const Vec128<float, N> sum0,
2229 Vec128<float, N>& sum1) {
2230 const Rebind<bfloat16_t,
decltype(df32)> dbf16;
2242template <
typename T,
size_t N>
2245 for (
size_t i = 0; i <
N; ++i) {
2250template <
typename T,
size_t N>
2252 T min = HighestValue<T>();
2253 for (
size_t i = 0; i <
N; ++i) {
2258template <
typename T,
size_t N>
2260 T max = LowestValue<T>();
2261 for (
size_t i = 0; i <
N; ++i) {
2272 const Vec128<uint64_t> b) {
2273 alignas(16) uint64_t mul[2];
2275 return Load(Full128<uint64_t>(), mul);
2279 const Vec128<uint64_t> b) {
2280 alignas(16) uint64_t mul[2];
2281 const Half<Full128<uint64_t>> d2;
2284 return Load(Full128<uint64_t>(), mul);
2317HWY_API auto Eq(V a, V b) ->
decltype(a == b) {
2321HWY_API auto Ne(V a, V b) ->
decltype(a == b) {
2325HWY_API auto Lt(V a, V b) ->
decltype(a == b) {
2330HWY_API auto Gt(V a, V b) ->
decltype(a == b) {
2334HWY_API auto Ge(V a, V b) ->
decltype(a == b) {
2339HWY_API auto Le(V a, V b) ->
decltype(a == b) {
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DASSERT(condition)
Definition: base.h:191
Definition: arm_neon-inl.h:804
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:806
static HWY_INLINE Raw FromBool(bool b)
Definition: emu128-inl.h:73
Raw bits[16/sizeof(T)]
Definition: emu128-inl.h:78
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128()=default
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: emu128-inl.h:42
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: emu128-inl.h:48
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: emu128-inl.h:57
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: emu128-inl.h:54
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: emu128-inl.h:39
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: emu128-inl.h:51
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: emu128-inl.h:45
HWY_INLINE constexpr T IncrementWithWraparound(T t)
Definition: emu128-inl.h:139
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_INLINE void StoreU16ToF16(const uint16_t val, hwy::float16_t *HWY_RESTRICT to)
Definition: emu128-inl.h:1464
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t *HWY_RESTRICT from)
Definition: emu128-inl.h:1473
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:831
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:839
constexpr float HighestValue< float >()
Definition: base.h:580
constexpr float LowestValue< float >()
Definition: base.h:567
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
HWY_API constexpr T LimitsMax()
Definition: base.h:548
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: arm_neon-inl.h:3883
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3884
Definition: ops/shared-inl.h:40
Definition: emu128-inl.h:84
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: emu128-inl.h:86
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v) const
Definition: emu128-inl.h:448
Definition: emu128-inl.h:438
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v) const
Definition: emu128-inl.h:440