Grok 10.0.3
scalar-inl.h
Go to the documentation of this file.
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Single-element vectors and operations.
17// External include guard in highway.h - see comment there.
18
19#include <stddef.h>
20#include <stdint.h>
21
22#include "hwy/base.h"
23#include "hwy/ops/shared-inl.h"
24
26namespace hwy {
27namespace HWY_NAMESPACE {
28
29// Single instruction, single data.
30template <typename T>
32
33// (Wrapper class required for overloading comparison operators.)
34template <typename T>
35struct Vec1 {
36 HWY_INLINE Vec1() = default;
37 Vec1(const Vec1&) = default;
38 Vec1& operator=(const Vec1&) = default;
39 HWY_INLINE explicit Vec1(const T t) : raw(t) {}
40
42 return *this = (*this * other);
43 }
45 return *this = (*this / other);
46 }
48 return *this = (*this + other);
49 }
51 return *this = (*this - other);
52 }
54 return *this = (*this & other);
55 }
57 return *this = (*this | other);
58 }
60 return *this = (*this ^ other);
61 }
62
63 T raw;
64};
65
66// 0 or FF..FF, same size as Vec1.
67template <typename T>
68class Mask1 {
70
71 public:
72 static HWY_INLINE Mask1<T> FromBool(bool b) {
73 Mask1<T> mask;
74 mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
75 return mask;
76 }
77
79};
80
81namespace detail {
82
83// Deduce Sisd<T> from Vec1<T>
84struct Deduce1 {
85 template <typename T>
87 return Sisd<T>();
88 }
89};
90
91} // namespace detail
92
93template <class V>
94using DFromV = decltype(detail::Deduce1()(V()));
95
96template <class V>
97using TFromV = TFromD<DFromV<V>>;
98
99// ------------------------------ BitCast
100
101template <typename T, typename FromT>
103 static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
104 T to;
105 CopyBytes<sizeof(FromT)>(&v.raw, &to);
106 return Vec1<T>(to);
107}
108
109// ------------------------------ Set
110
111template <typename T>
113 return Vec1<T>(T(0));
114}
115
116template <typename T, typename T2>
117HWY_API Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
118 return Vec1<T>(static_cast<T>(t));
119}
120
121template <typename T>
123 return Zero(d);
124}
125
126template <typename T, typename T2>
127HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
128 return Vec1<T>(static_cast<T>(first));
129}
130
131// ================================================== LOGICAL
132
133// ------------------------------ Not
134
135template <typename T>
137 using TU = MakeUnsigned<T>;
138 const Sisd<TU> du;
139 return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
140}
141
142// ------------------------------ And
143
144template <typename T>
145HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
146 using TU = MakeUnsigned<T>;
147 const Sisd<TU> du;
148 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
149}
150template <typename T>
152 return And(a, b);
153}
154
155// ------------------------------ AndNot
156
157template <typename T>
159 using TU = MakeUnsigned<T>;
160 const Sisd<TU> du;
161 return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
162 BitCast(du, b).raw)));
163}
164
165// ------------------------------ Or
166
167template <typename T>
168HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
169 using TU = MakeUnsigned<T>;
170 const Sisd<TU> du;
171 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
172}
173template <typename T>
175 return Or(a, b);
176}
177
178// ------------------------------ Xor
179
180template <typename T>
181HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
182 using TU = MakeUnsigned<T>;
183 const Sisd<TU> du;
184 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
185}
186template <typename T>
188 return Xor(a, b);
189}
190
191// ------------------------------ Or3
192
193template <typename T>
195 return Or(o1, Or(o2, o3));
196}
197
198// ------------------------------ OrAnd
199
200template <typename T>
201HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
202 return Or(o, And(a1, a2));
203}
204
205// ------------------------------ IfVecThenElse
206
207template <typename T>
209 return IfThenElse(MaskFromVec(mask), yes, no);
210}
211
212// ------------------------------ CopySign
213
214template <typename T>
215HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
216 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
217 const auto msb = SignBit(Sisd<T>());
218 return Or(AndNot(msb, magn), And(msb, sign));
219}
220
221template <typename T>
223 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
224 return Or(abs, And(SignBit(Sisd<T>()), sign));
225}
226
227// ------------------------------ BroadcastSignBit
228
229template <typename T>
231 // This is used inside ShiftRight, so we cannot implement in terms of it.
232 return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
233}
234
235// ------------------------------ PopulationCount
236
237#ifdef HWY_NATIVE_POPCNT
238#undef HWY_NATIVE_POPCNT
239#else
240#define HWY_NATIVE_POPCNT
241#endif
242
243template <typename T>
245 return Vec1<T>(static_cast<T>(PopCount(v.raw)));
246}
247
248// ------------------------------ Mask
249
250template <typename TFrom, typename TTo>
252 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
253 return Mask1<TTo>{m.bits};
254}
255
256// v must be 0 or FF..FF.
257template <typename T>
259 Mask1<T> mask;
260 CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
261 return mask;
262}
263
264template <typename T>
266 Vec1<T> v;
267 CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
268 return v;
269}
270
271template <typename T>
272Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
273 Vec1<T> v;
274 CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
275 return v;
276}
277
278template <typename T>
279HWY_API Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
280 return Mask1<T>::FromBool(n != 0);
281}
282
283// Returns mask ? yes : no.
284template <typename T>
286 const Vec1<T> no) {
287 return mask.bits ? yes : no;
288}
289
290template <typename T>
292 return mask.bits ? yes : Vec1<T>(0);
293}
294
295template <typename T>
297 return mask.bits ? Vec1<T>(0) : no;
298}
299
300template <typename T>
302 return v.raw < 0 ? yes : no;
303}
304
305template <typename T>
307 return v.raw < 0 ? Vec1<T>(0) : v;
308}
309
310// ------------------------------ Mask logical
311
312template <typename T>
314 return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
315}
316
317template <typename T>
319 const Sisd<T> d;
320 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
321}
322
323template <typename T>
325 const Sisd<T> d;
326 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
327}
328
329template <typename T>
331 const Sisd<T> d;
332 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
333}
334
335template <typename T>
337 const Sisd<T> d;
338 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
339}
340
341// ================================================== SHIFTS
342
343// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
344
345template <int kBits, typename T>
347 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
348 return Vec1<T>(
349 static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
350}
351
352template <int kBits, typename T>
354 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
355#if __cplusplus >= 202002L
356 // Signed right shift is now guaranteed to be arithmetic (rounding toward
357 // negative infinity, i.e. shifting in the sign bit).
358 return Vec1<T>(static_cast<T>(v.raw >> kBits));
359#else
360 if (IsSigned<T>()) {
361 // Emulate arithmetic shift using only logical (unsigned) shifts, because
362 // signed shifts are still implementation-defined.
363 using TU = hwy::MakeUnsigned<T>;
364 const Sisd<TU> du;
365 const TU shifted = BitCast(du, v).raw >> kBits;
366 const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
367 const size_t sign_shift =
368 static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
369 const TU upper = static_cast<TU>(sign << sign_shift);
370 return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
371 } else { // T is unsigned
372 return Vec1<T>(static_cast<T>(v.raw >> kBits));
373 }
374#endif
375}
376
377// ------------------------------ RotateRight (ShiftRight)
378
379namespace detail {
380
381// For partial specialization: kBits == 0 results in an invalid shift count
382template <int kBits>
383struct RotateRight {
384 template <typename T>
386 return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
387 }
388};
389
390template <>
391struct RotateRight<0> {
392 template <typename T>
394 return v;
395 }
396};
397
398} // namespace detail
399
400template <int kBits, typename T>
402 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
404}
405
406// ------------------------------ ShiftLeftSame (BroadcastSignBit)
407
408template <typename T>
410 return Vec1<T>(
411 static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
412}
413
414template <typename T>
416#if __cplusplus >= 202002L
417 // Signed right shift is now guaranteed to be arithmetic (rounding toward
418 // negative infinity, i.e. shifting in the sign bit).
419 return Vec1<T>(static_cast<T>(v.raw >> bits));
420#else
421 if (IsSigned<T>()) {
422 // Emulate arithmetic shift using only logical (unsigned) shifts, because
423 // signed shifts are still implementation-defined.
424 using TU = hwy::MakeUnsigned<T>;
425 const Sisd<TU> du;
426 const TU shifted = BitCast(du, v).raw >> bits;
427 const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
428 const size_t sign_shift =
429 static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
430 const TU upper = static_cast<TU>(sign << sign_shift);
431 return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
432 } else { // T is unsigned
433 return Vec1<T>(static_cast<T>(v.raw >> bits));
434 }
435#endif
436}
437
438// ------------------------------ Shl
439
440// Single-lane => same as ShiftLeftSame except for the argument type.
441template <typename T>
443 return ShiftLeftSame(v, static_cast<int>(bits.raw));
444}
445
446template <typename T>
448 return ShiftRightSame(v, static_cast<int>(bits.raw));
449}
450
451// ================================================== ARITHMETIC
452
453template <typename T>
455 const uint64_t a64 = static_cast<uint64_t>(a.raw);
456 const uint64_t b64 = static_cast<uint64_t>(b.raw);
457 return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
458}
460 return Vec1<float>(a.raw + b.raw);
461}
463 return Vec1<double>(a.raw + b.raw);
464}
465
466template <typename T>
468 const uint64_t a64 = static_cast<uint64_t>(a.raw);
469 const uint64_t b64 = static_cast<uint64_t>(b.raw);
470 return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
471}
473 return Vec1<float>(a.raw - b.raw);
474}
476 return Vec1<double>(a.raw - b.raw);
477}
478
479// ------------------------------ SumsOf8
480
482 return Vec1<uint64_t>(v.raw);
483}
484
485// ------------------------------ SaturatedAdd
486
487// Returns a + b clamped to the destination range.
488
489// Unsigned
491 const Vec1<uint8_t> b) {
492 return Vec1<uint8_t>(
493 static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
494}
496 const Vec1<uint16_t> b) {
497 return Vec1<uint16_t>(
498 static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
499}
500
501// Signed
503 return Vec1<int8_t>(
504 static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
505}
507 const Vec1<int16_t> b) {
508 return Vec1<int16_t>(
509 static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
510}
511
512// ------------------------------ Saturating subtraction
513
514// Returns a - b clamped to the destination range.
515
516// Unsigned
518 const Vec1<uint8_t> b) {
519 return Vec1<uint8_t>(
520 static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
521}
523 const Vec1<uint16_t> b) {
524 return Vec1<uint16_t>(
525 static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
526}
527
528// Signed
530 return Vec1<int8_t>(
531 static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
532}
534 const Vec1<int16_t> b) {
535 return Vec1<int16_t>(
536 static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
537}
538
539// ------------------------------ Average
540
541// Returns (a + b + 1) / 2
542
544 const Vec1<uint8_t> b) {
545 return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
546}
548 const Vec1<uint16_t> b) {
549 return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
550}
551
552// ------------------------------ Absolute value
553
554template <typename T>
556 const T i = a.raw;
557 return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
558}
560 return Vec1<float>(std::abs(a.raw));
561}
563 return Vec1<double>(std::abs(a.raw));
564}
565
566// ------------------------------ min/max
567
568template <typename T, HWY_IF_NOT_FLOAT(T)>
569HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
570 return Vec1<T>(HWY_MIN(a.raw, b.raw));
571}
572
573template <typename T, HWY_IF_FLOAT(T)>
574HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
575 if (std::isnan(a.raw)) return b;
576 if (std::isnan(b.raw)) return a;
577 return Vec1<T>(HWY_MIN(a.raw, b.raw));
578}
579
580template <typename T, HWY_IF_NOT_FLOAT(T)>
581HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
582 return Vec1<T>(HWY_MAX(a.raw, b.raw));
583}
584
585template <typename T, HWY_IF_FLOAT(T)>
586HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
587 if (std::isnan(a.raw)) return b;
588 if (std::isnan(b.raw)) return a;
589 return Vec1<T>(HWY_MAX(a.raw, b.raw));
590}
591
592// ------------------------------ Floating-point negate
593
594template <typename T, HWY_IF_FLOAT(T)>
596 return Xor(v, SignBit(Sisd<T>()));
597}
598
599template <typename T, HWY_IF_NOT_FLOAT(T)>
600HWY_API Vec1<T> Neg(const Vec1<T> v) {
601 return Zero(Sisd<T>()) - v;
602}
603
604// ------------------------------ mul/div
605
606template <typename T, HWY_IF_FLOAT(T)>
608 return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
609}
610
611template <typename T, HWY_IF_SIGNED(T)>
612HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
613 return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
614}
615
616template <typename T, HWY_IF_UNSIGNED(T)>
617HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
618 return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
619}
620
621template <typename T>
623 return Vec1<T>(a.raw / b.raw);
624}
625
626// Returns the upper 16 bits of a * b in each lane.
628 return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
629}
631 // Cast to uint32_t first to prevent overflow. Otherwise the result of
632 // uint16_t * uint16_t is in "int" which may overflow. In practice the result
633 // is the same but this way it is also defined.
634 return Vec1<uint16_t>(static_cast<uint16_t>(
635 (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
636}
637
639 return Vec1<int16_t>(static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16));
640}
641
642// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
644 const int64_t a64 = a.raw;
645 return Vec1<int64_t>(a64 * b.raw);
646}
648 const uint64_t a64 = a.raw;
649 return Vec1<uint64_t>(a64 * b.raw);
650}
651
652// Approximate reciprocal
654 // Zero inputs are allowed, but callers are responsible for replacing the
655 // return value with something else (typically using IfThenElse). This check
656 // avoids a ubsan error. The return value is arbitrary.
657 if (v.raw == 0.0f) return Vec1<float>(0.0f);
658 return Vec1<float>(1.0f / v.raw);
659}
660
661// Absolute value of difference.
663 return Abs(a - b);
664}
665
666// ------------------------------ Floating-point multiply-add variants
667
668template <typename T>
669HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
670 return mul * x + add;
671}
672
673template <typename T>
675 const Vec1<T> add) {
676 return add - mul * x;
677}
678
679template <typename T>
680HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
681 return mul * x - sub;
682}
683
684template <typename T>
686 const Vec1<T> sub) {
687 return Neg(mul) * x - sub;
688}
689
690// ------------------------------ Floating-point square root
691
692// Approximate reciprocal square root
694 float f = v.raw;
695 const float half = f * 0.5f;
696 uint32_t bits;
697 CopyBytes<4>(&f, &bits);
698 // Initial guess based on log2(f)
699 bits = 0x5F3759DF - (bits >> 1);
700 CopyBytes<4>(&bits, &f);
701 // One Newton-Raphson iteration
702 return Vec1<float>(f * (1.5f - (half * f * f)));
703}
704
705// Square root
707 return Vec1<float>(std::sqrt(v.raw));
708}
710 return Vec1<double>(std::sqrt(v.raw));
711}
712
713// ------------------------------ Floating-point rounding
714
715template <typename T>
717 using TI = MakeSigned<T>;
718 if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN
719 return v;
720 }
721 const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
722 const TI rounded = static_cast<TI>(v.raw + bias);
723 if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
724 // Round to even
725 if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
726 return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
727 }
728 return Vec1<T>(static_cast<T>(rounded));
729}
730
731// Round-to-nearest even.
733 using T = float;
734 using TI = int32_t;
735
736 const T abs = Abs(v).raw;
737 const bool signbit = std::signbit(v.raw);
738
739 if (!(abs < MantissaEnd<T>())) { // Huge or NaN
740 // Check if too large to cast or NaN
741 if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
742 return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
743 }
744 return Vec1<int32_t>(static_cast<TI>(v.raw));
745 }
746 const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
747 const TI rounded = static_cast<TI>(v.raw + bias);
748 if (rounded == 0) return Vec1<int32_t>(0);
749 // Round to even
750 if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
751 return Vec1<TI>(rounded - (signbit ? -1 : 1));
752 }
753 return Vec1<TI>(rounded);
754}
755
756template <typename T>
758 using TI = MakeSigned<T>;
759 if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN
760 return v;
761 }
762 const TI truncated = static_cast<TI>(v.raw);
763 if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
764 return Vec1<T>(static_cast<T>(truncated));
765}
766
767template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
768 class V>
769V Ceiling(const V v) {
770 const Bits kExponentMask = (1ull << kExponentBits) - 1;
771 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
772 const Bits kBias = kExponentMask / 2;
773
774 Float f = v.raw;
775 const bool positive = f > Float(0.0);
776
777 Bits bits;
778 CopyBytes<sizeof(Bits)>(&v, &bits);
779
780 const int exponent =
781 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
782 // Already an integer.
783 if (exponent >= kMantissaBits) return v;
784 // |v| <= 1 => 0 or 1.
785 if (exponent < 0) return positive ? V(1) : V(-0.0);
786
787 const Bits mantissa_mask = kMantissaMask >> exponent;
788 // Already an integer
789 if ((bits & mantissa_mask) == 0) return v;
790
791 // Clear fractional bits and round up
792 if (positive) bits += (kMantissaMask + 1) >> exponent;
793 bits &= ~mantissa_mask;
794
795 CopyBytes<sizeof(Bits)>(&bits, &f);
796 return V(f);
797}
798
799template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
800 class V>
801V Floor(const V v) {
802 const Bits kExponentMask = (1ull << kExponentBits) - 1;
803 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
804 const Bits kBias = kExponentMask / 2;
805
806 Float f = v.raw;
807 const bool negative = f < Float(0.0);
808
809 Bits bits;
810 CopyBytes<sizeof(Bits)>(&v, &bits);
811
812 const int exponent =
813 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
814 // Already an integer.
815 if (exponent >= kMantissaBits) return v;
816 // |v| <= 1 => -1 or 0.
817 if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
818
819 const Bits mantissa_mask = kMantissaMask >> exponent;
820 // Already an integer
821 if ((bits & mantissa_mask) == 0) return v;
822
823 // Clear fractional bits and round down
824 if (negative) bits += (kMantissaMask + 1) >> exponent;
825 bits &= ~mantissa_mask;
826
827 CopyBytes<sizeof(Bits)>(&bits, &f);
828 return V(f);
829}
830
831// Toward +infinity, aka ceiling
833 return Ceiling<float, uint32_t, 23, 8>(v);
834}
836 return Ceiling<double, uint64_t, 52, 11>(v);
837}
838
839// Toward -infinity, aka floor
841 return Floor<float, uint32_t, 23, 8>(v);
842}
844 return Floor<double, uint64_t, 52, 11>(v);
845}
846
847// ================================================== COMPARE
848
849template <typename T>
851 return Mask1<T>::FromBool(a.raw == b.raw);
852}
853
854template <typename T>
856 return Mask1<T>::FromBool(a.raw != b.raw);
857}
858
859template <typename T>
861 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
862 return (v & bit) == bit;
863}
864
865template <typename T>
867 return Mask1<T>::FromBool(a.raw < b.raw);
868}
869template <typename T>
871 return Mask1<T>::FromBool(a.raw > b.raw);
872}
873
874template <typename T>
876 return Mask1<T>::FromBool(a.raw <= b.raw);
877}
878template <typename T>
880 return Mask1<T>::FromBool(a.raw >= b.raw);
881}
882
883// ------------------------------ Floating-point classification (==)
884
885template <typename T>
887 // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
888 MakeUnsigned<T> bits;
889 memcpy(&bits, &v, sizeof(v));
890 bits += bits;
891 bits >>= 1; // clear sign bit
892 // NaN if all exponent bits are set and the mantissa is not zero.
893 return Mask1<T>::FromBool(bits > ExponentMask<T>());
894}
895
897 const Sisd<float> d;
898 const RebindToUnsigned<decltype(d)> du;
899 const Vec1<uint32_t> vu = BitCast(du, v);
900 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
901 return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
902}
904 const Sisd<double> d;
905 const RebindToUnsigned<decltype(d)> du;
906 const Vec1<uint64_t> vu = BitCast(du, v);
907 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
908 return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
909}
910
912 const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
913 // Shift left to clear the sign bit, check whether exponent != max value.
914 return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
915}
917 const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
918 // Shift left to clear the sign bit, check whether exponent != max value.
919 return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
920}
921
922// ================================================== MEMORY
923
924// ------------------------------ Load
925
926template <typename T>
927HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
928 T t;
929 CopyBytes<sizeof(T)>(aligned, &t);
930 return Vec1<T>(t);
931}
932
933template <typename T>
935 const T* HWY_RESTRICT aligned) {
936 return IfThenElseZero(m, Load(d, aligned));
937}
938
939template <typename T>
941 return Load(d, p);
942}
943
944// In some use cases, "load single lane" is sufficient; otherwise avoid this.
945template <typename T>
947 return Load(d, aligned);
948}
949
950// ------------------------------ Store
951
952template <typename T>
953HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
954 T* HWY_RESTRICT aligned) {
955 CopyBytes<sizeof(T)>(&v.raw, aligned);
956}
957
958template <typename T>
960 return Store(v, d, p);
961}
962
963template <typename T>
965 T* HWY_RESTRICT p) {
966 if (!m.bits) return;
967 StoreU(v, d, p);
968}
969
970// ------------------------------ LoadInterleaved2/3/4
971
972// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
973#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
974#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
975#else
976#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
977#endif
978
979template <typename T>
981 Vec1<T>& v0, Vec1<T>& v1) {
982 v0 = LoadU(d, unaligned + 0);
983 v1 = LoadU(d, unaligned + 1);
984}
985
986template <typename T>
988 Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2) {
989 v0 = LoadU(d, unaligned + 0);
990 v1 = LoadU(d, unaligned + 1);
991 v2 = LoadU(d, unaligned + 2);
992}
993
994template <typename T>
996 Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2,
997 Vec1<T>& v3) {
998 v0 = LoadU(d, unaligned + 0);
999 v1 = LoadU(d, unaligned + 1);
1000 v2 = LoadU(d, unaligned + 2);
1001 v3 = LoadU(d, unaligned + 3);
1002}
1003
1004// ------------------------------ StoreInterleaved2/3/4
1005
1006template <typename T>
1008 T* HWY_RESTRICT unaligned) {
1009 StoreU(v0, d, unaligned + 0);
1010 StoreU(v1, d, unaligned + 1);
1011}
1012
1013template <typename T>
1015 const Vec1<T> v2, Sisd<T> d,
1016 T* HWY_RESTRICT unaligned) {
1017 StoreU(v0, d, unaligned + 0);
1018 StoreU(v1, d, unaligned + 1);
1019 StoreU(v2, d, unaligned + 2);
1020}
1021
1022template <typename T>
1024 const Vec1<T> v2, const Vec1<T> v3, Sisd<T> d,
1025 T* HWY_RESTRICT unaligned) {
1026 StoreU(v0, d, unaligned + 0);
1027 StoreU(v1, d, unaligned + 1);
1028 StoreU(v2, d, unaligned + 2);
1029 StoreU(v3, d, unaligned + 3);
1030}
1031
1032// ------------------------------ Stream
1033
1034template <typename T>
1035HWY_API void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
1036 return Store(v, d, aligned);
1037}
1038
1039// ------------------------------ Scatter
1040
1041template <typename T, typename Offset>
1043 const Vec1<Offset> offset) {
1044 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1045 uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
1046 return Store(v, d, reinterpret_cast<T*>(base8));
1047}
1048
1049template <typename T, typename Index>
1051 const Vec1<Index> index) {
1052 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1053 return Store(v, d, base + index.raw);
1054}
1055
1056// ------------------------------ Gather
1057
1058template <typename T, typename Offset>
1060 const Vec1<Offset> offset) {
1061 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1062 const intptr_t addr =
1063 reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
1064 return Load(d, reinterpret_cast<const T*>(addr));
1065}
1066
1067template <typename T, typename Index>
1069 const Vec1<Index> index) {
1070 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1071 return Load(d, base + index.raw);
1072}
1073
1074// ================================================== CONVERT
1075
1076// ConvertTo and DemoteTo with floating-point input and integer output truncate
1077// (rounding toward zero).
1078
1079template <typename FromT, typename ToT>
1081 static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
1082 // For bits Y > X, floatX->floatY and intX->intY are always representable.
1083 return Vec1<ToT>(static_cast<ToT>(from.raw));
1084}
1085
1086// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
1087// so we overload for FromT=double and ToT={float,int32_t}.
1089 // Prevent ubsan errors when converting float to narrower integer/float
1090 if (std::isinf(from.raw) ||
1091 std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
1092 return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
1094 }
1095 return Vec1<float>(static_cast<float>(from.raw));
1096}
1098 // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
1099 if (std::isinf(from.raw) ||
1100 std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
1101 return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
1102 : HighestValue<int32_t>());
1103 }
1104 return Vec1<int32_t>(static_cast<int32_t>(from.raw));
1105}
1106
1107template <typename FromT, typename ToT>
1109 static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
1110 static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
1111
1112 // Int to int: choose closest value in ToT to `from` (avoids UB)
1113 from.raw = HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
1114 return Vec1<ToT>(static_cast<ToT>(from.raw));
1115}
1116
1118#if HWY_NATIVE_FLOAT16
1119 uint16_t bits16;
1120 CopyBytes<2>(&v.raw, &bits16);
1121#else
1122 const uint16_t bits16 = v.raw.bits;
1123#endif
1124 const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
1125 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1126 const uint32_t mantissa = bits16 & 0x3FF;
1127
1128 // Subnormal or zero
1129 if (biased_exp == 0) {
1130 const float subnormal =
1131 (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
1132 return Vec1<float>(sign ? -subnormal : subnormal);
1133 }
1134
1135 // Normalized: convert the representation directly (faster than ldexp/tables).
1136 const uint32_t biased_exp32 = biased_exp + (127 - 15);
1137 const uint32_t mantissa32 = mantissa << (23 - 10);
1138 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1139 float out;
1140 CopyBytes<4>(&bits32, &out);
1141 return Vec1<float>(out);
1142}
1143
1145 return Set(d, F32FromBF16(v.raw));
1146}
1147
1149 const Vec1<float> v) {
1150 uint32_t bits32;
1151 CopyBytes<4>(&v.raw, &bits32);
1152 const uint32_t sign = bits32 >> 31;
1153 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1154 const uint32_t mantissa32 = bits32 & 0x7FFFFF;
1155
1156 const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
1157
1158 // Tiny or zero => zero.
1159 Vec1<float16_t> out;
1160 if (exp < -24) {
1161#if HWY_NATIVE_FLOAT16
1162 const uint16_t zero = 0;
1163 CopyBytes<2>(&zero, &out.raw);
1164#else
1165 out.raw.bits = 0;
1166#endif
1167 return out;
1168 }
1169
1170 uint32_t biased_exp16, mantissa16;
1171
1172 // exp = [-24, -15] => subnormal
1173 if (exp < -14) {
1174 biased_exp16 = 0;
1175 const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
1176 HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
1177 mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
1178 (mantissa32 >> (13 + sub_exp)));
1179 } else {
1180 // exp = [-14, 15]
1181 biased_exp16 = static_cast<uint32_t>(exp + 15);
1182 HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1183 mantissa16 = mantissa32 >> 13;
1184 }
1185
1186 HWY_DASSERT(mantissa16 < 1024);
1187 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1188 HWY_DASSERT(bits16 < 0x10000);
1189#if HWY_NATIVE_FLOAT16
1190 const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
1191 CopyBytes<2>(&narrowed, &out.raw);
1192#else
1193 out.raw.bits = static_cast<uint16_t>(bits16);
1194#endif
1195 return out;
1196}
1197
1199 return Set(d, BF16FromF32(v.raw));
1200}
1201
1202template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
1204 static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1205 // float## -> int##: return closest representable value. We cannot exactly
1206 // represent LimitsMax<ToT> in FromT, so use double.
1207 const double f = static_cast<double>(from.raw);
1208 if (std::isinf(from.raw) ||
1209 std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
1210 return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
1211 : LimitsMax<ToT>());
1212 }
1213 return Vec1<ToT>(static_cast<ToT>(from.raw));
1214}
1215
1216template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
1217HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1218 static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1219 // int## -> float##: no check needed
1220 return Vec1<ToT>(static_cast<ToT>(from.raw));
1221}
1222
1224 return DemoteTo(Sisd<uint8_t>(), v);
1225}
1226
1227// ================================================== COMBINE
1228// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
1229
1230template <typename T>
1232 return v;
1233}
1234
1235template <typename T>
1237 return v;
1238}
1239
1240// ================================================== SWIZZLE
1241
1242template <typename T>
1244 return v.raw;
1245}
1246
1247template <typename T>
1248HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
1249 HWY_DASSERT(i == 0);
1250 (void)i;
1251 return v.raw;
1252}
1253
1254template <typename T>
1256 HWY_DASSERT(i == 0);
1257 (void)i;
1258 v.raw = t;
1259 return v;
1260}
1261
1262template <typename T>
1264 return v;
1265}
1266// DupOdd is unsupported.
1267
1268template <typename T>
1270 return even;
1271}
1272
1273template <typename T>
1275 return even;
1276}
1277
1278// ------------------------------ SwapAdjacentBlocks
1279
1280template <typename T>
1282 return v;
1283}
1284
1285// ------------------------------ TableLookupLanes
1286
1287// Returned by SetTableIndices for use by TableLookupLanes.
1288template <typename T>
1289struct Indices1 {
1291};
1292
1293template <typename T, typename TI>
1295 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
1296 HWY_DASSERT(vec.raw == 0);
1297 return Indices1<T>{vec.raw};
1298}
1299
1300template <typename T, typename TI>
1302 return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
1303}
1304
1305template <typename T>
1307 return v;
1308}
1309
1310// ------------------------------ ReverseBlocks
1311
1312// Single block: no change
1313template <typename T>
1315 return v;
1316}
1317
1318// ------------------------------ Reverse
1319
1320template <typename T>
1322 return v;
1323}
1324
1325// Must not be called:
1326template <typename T>
1328 return v;
1329}
1330
1331template <typename T>
1333 return v;
1334}
1335
1336template <typename T>
1338 return v;
1339}
1340
1341// ================================================== BLOCKWISE
1342// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1343
1344// ------------------------------ Broadcast/splat any lane
1345
1346template <int kLane, typename T>
1348 static_assert(kLane == 0, "Scalar only has one lane");
1349 return v;
1350}
1351
1352// ------------------------------ TableLookupBytes, TableLookupBytesOr0
1353
1354template <typename T, typename TI>
1356 uint8_t in_bytes[sizeof(T)];
1357 uint8_t idx_bytes[sizeof(T)];
1358 uint8_t out_bytes[sizeof(T)];
1359 CopyBytes<sizeof(T)>(&in, &in_bytes);
1360 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1361 for (size_t i = 0; i < sizeof(T); ++i) {
1362 out_bytes[i] = in_bytes[idx_bytes[i]];
1363 }
1364 TI out;
1365 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1366 return Vec1<TI>{out};
1367}
1368
1369template <typename T, typename TI>
1371 uint8_t in_bytes[sizeof(T)];
1372 uint8_t idx_bytes[sizeof(T)];
1373 uint8_t out_bytes[sizeof(T)];
1374 CopyBytes<sizeof(T)>(&in, &in_bytes);
1375 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1376 for (size_t i = 0; i < sizeof(T); ++i) {
1377 out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1378 }
1379 TI out;
1380 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1381 return Vec1<TI>{out};
1382}
1383
1384// ------------------------------ ZipLower
1385
1387 return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
1388}
1390 const Vec1<uint16_t> b) {
1391 return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
1392}
1394 const Vec1<uint32_t> b) {
1395 return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
1396}
1398 return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
1399}
1401 return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
1402}
1404 return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
1405}
1406
1407template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
1409 return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
1410}
1411
1412// ================================================== MASK
1413
1414template <typename T>
1415HWY_API bool AllFalse(Sisd<T> /* tag */, const Mask1<T> mask) {
1416 return mask.bits == 0;
1417}
1418
1419template <typename T>
1420HWY_API bool AllTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1421 return mask.bits != 0;
1422}
1423
1424// `p` points to at least 8 readable bytes, not all of which need be valid.
1425template <typename T>
1427 const uint8_t* HWY_RESTRICT bits) {
1428 return Mask1<T>::FromBool((bits[0] & 1) != 0);
1429}
1430
1431// `p` points to at least 8 writable bytes.
1432template <typename T>
1433HWY_API size_t StoreMaskBits(Sisd<T> d, const Mask1<T> mask, uint8_t* bits) {
1434 *bits = AllTrue(d, mask);
1435 return 1;
1436}
1437
1438template <typename T>
1439HWY_API size_t CountTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1440 return mask.bits == 0 ? 0 : 1;
1441}
1442
1443template <typename T>
1444HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1445 return mask.bits == 0 ? -1 : 0;
1446}
1447
1448// ------------------------------ Compress, CompressBits
1449
// Compressing a single lane trivially yields a partition of it.
template <typename T>
struct CompressIsPartition {
  enum { value = 1 };
};
1454
1455template <typename T>
1457 // A single lane is already partitioned by definition.
1458 return v;
1459}
1460
1461template <typename T>
1463 // A single lane is already partitioned by definition.
1464 return v;
1465}
1466
1467// ------------------------------ CompressStore
1468template <typename T>
1470 T* HWY_RESTRICT unaligned) {
1471 StoreU(Compress(v, mask), d, unaligned);
1472 return CountTrue(d, mask);
1473}
1474
1475// ------------------------------ CompressBlendedStore
1476template <typename T>
1478 T* HWY_RESTRICT unaligned) {
1479 if (!mask.bits) return 0;
1480 StoreU(v, d, unaligned);
1481 return 1;
1482}
1483
1484// ------------------------------ CompressBits
1485template <typename T>
1487 return v;
1488}
1489
1490// ------------------------------ CompressBitsStore
1491template <typename T>
1492HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
1493 Sisd<T> d, T* HWY_RESTRICT unaligned) {
1494 const Mask1<T> mask = LoadMaskBits(d, bits);
1495 StoreU(Compress(v, mask), d, unaligned);
1496 return CountTrue(d, mask);
1497}
1498
1499// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1500
1504 const Vec1<float> sum0,
1505 Vec1<float>& /* sum1 */) {
1506 return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
1507 Vec1<float>(F32FromBF16(b.raw)), sum0);
1508}
1509
1510// ================================================== REDUCTIONS
1511
1512// Sum of all lanes, i.e. the only one.
1513template <typename T>
1515 return v;
1516}
1517template <typename T>
1519 return v;
1520}
1521template <typename T>
1523 return v;
1524}
1525
1526// ================================================== Operator wrapper
1527
1528template <class V>
1529HWY_API V Add(V a, V b) {
1530 return a + b;
1531}
1532template <class V>
1533HWY_API V Sub(V a, V b) {
1534 return a - b;
1535}
1536
1537template <class V>
1538HWY_API V Mul(V a, V b) {
1539 return a * b;
1540}
1541template <class V>
1542HWY_API V Div(V a, V b) {
1543 return a / b;
1544}
1545
// Named form of operator<<.
template <class V>
V Shl(V a, V b) {
  const V shifted = a << b;
  return shifted;
}
// Named form of operator>>.
template <class V>
V Shr(V a, V b) {
  const V shifted = a >> b;
  return shifted;
}
1554
1555template <class V>
1556HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1557 return a == b;
1558}
1559template <class V>
1560HWY_API auto Ne(V a, V b) -> decltype(a == b) {
1561 return a != b;
1562}
1563template <class V>
1564HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1565 return a < b;
1566}
1567
1568template <class V>
1569HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1570 return a > b;
1571}
1572template <class V>
1573HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1574 return a >= b;
1575}
1576
1577template <class V>
1578HWY_API auto Le(V a, V b) -> decltype(a == b) {
1579 return a <= b;
1580}
1581
1582// NOLINTNEXTLINE(google-readability-namespace-comments)
1583} // namespace HWY_NAMESPACE
1584} // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DASSERT(condition)
Definition: base.h:191
Definition: scalar-inl.h:68
Raw bits
Definition: scalar-inl.h:78
hwy::MakeUnsigned< T > Raw
Definition: scalar-inl.h:69
static HWY_INLINE Mask1< T > FromBool(bool b)
Definition: scalar-inl.h:72
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
V Ceiling(const V v)
Definition: scalar-inl.h:769
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:831
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:839
constexpr float HighestValue< float >()
Definition: base.h:580
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
constexpr float LowestValue< float >()
Definition: base.h:567
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: scalar-inl.h:1289
MakeSigned< T > raw
Definition: scalar-inl.h:1290
Definition: ops/shared-inl.h:40
Definition: scalar-inl.h:35
T raw
Definition: scalar-inl.h:63
HWY_INLINE Vec1 & operator*=(const Vec1 other)
Definition: scalar-inl.h:41
Vec1(const Vec1 &)=default
HWY_INLINE Vec1()=default
HWY_INLINE Vec1 & operator^=(const Vec1 other)
Definition: scalar-inl.h:59
HWY_INLINE Vec1(const T t)
Definition: scalar-inl.h:39
HWY_INLINE Vec1 & operator&=(const Vec1 other)
Definition: scalar-inl.h:53
Vec1 & operator=(const Vec1 &)=default
HWY_INLINE Vec1 & operator-=(const Vec1 other)
Definition: scalar-inl.h:50
HWY_INLINE Vec1 & operator+=(const Vec1 other)
Definition: scalar-inl.h:47
HWY_INLINE Vec1 & operator|=(const Vec1 other)
Definition: scalar-inl.h:56
HWY_INLINE Vec1 & operator/=(const Vec1 other)
Definition: scalar-inl.h:44
Definition: scalar-inl.h:84
Sisd< T > operator()(Vec1< T >) const
Definition: scalar-inl.h:86
HWY_INLINE Vec1< T > operator()(const Vec1< T > v) const
Definition: scalar-inl.h:393
Definition: emu128-inl.h:438
HWY_INLINE Vec1< T > operator()(const Vec1< T > v) const
Definition: scalar-inl.h:385