Grok 10.0.3
wasm_128-inl.h
Go to the documentation of this file.
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 128-bit WASM vectors and operations.
17// External include guard in highway.h - see comment there.
18
19#include <stddef.h>
20#include <stdint.h>
21#include <wasm_simd128.h>
22
23#include "hwy/base.h"
24#include "hwy/ops/shared-inl.h"
25
// Map current WASM intrinsic names to the pre-renaming spellings for older
// toolchains (the SIMD proposal renamed shuffle/extend/saturate intrinsics).
#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif
47
49namespace hwy {
50namespace HWY_NAMESPACE {
51
52namespace detail {
53
54template <typename T>
55struct Raw128 {
56 using type = __v128_u;
57};
58template <>
59struct Raw128<float> {
60 using type = __f32x4;
61};
62
63} // namespace detail
64
65template <typename T, size_t N = 16 / sizeof(T)>
66class Vec128 {
67 using Raw = typename detail::Raw128<T>::type;
68
69 public:
70 // Compound assignment. Only usable if there is a corresponding non-member
71 // binary operator overload. For example, only f32 and f64 support division.
73 return *this = (*this * other);
74 }
76 return *this = (*this / other);
77 }
79 return *this = (*this + other);
80 }
82 return *this = (*this - other);
83 }
85 return *this = (*this & other);
86 }
88 return *this = (*this | other);
89 }
91 return *this = (*this ^ other);
92 }
93
94 Raw raw;
95};
96
97template <typename T>
98using Vec64 = Vec128<T, 8 / sizeof(T)>;
99
100template <typename T>
101using Vec32 = Vec128<T, 4 / sizeof(T)>;
102
103// FF..FF or 0.
104template <typename T, size_t N = 16 / sizeof(T)>
105struct Mask128 {
107};
108
109namespace detail {
110
111// Deduce Simd<T, N, 0> from Vec128<T, N>
112struct DeduceD {
113 template <typename T, size_t N>
115 return Simd<T, N, 0>();
116 }
117};
118
119} // namespace detail
120
121template <class V>
122using DFromV = decltype(detail::DeduceD()(V()));
123
124template <class V>
125using TFromV = TFromD<DFromV<V>>;
126
127// ------------------------------ BitCast
128
129namespace detail {
130
131HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
132HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
133 return static_cast<__v128_u>(v);
134}
135HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
136 return static_cast<__v128_u>(v);
137}
138
139template <typename T, size_t N>
141 return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
142}
143
144// Cannot rely on function overloading because return types differ.
145template <typename T>
147 HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
148};
149template <>
151 HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
152};
153
154template <typename T, size_t N>
156 Vec128<uint8_t, N * sizeof(T)> v) {
158}
159
160} // namespace detail
161
162template <typename T, size_t N, typename FromT>
163HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
164 Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
166}
167
168// ------------------------------ Zero
169
170// Returns an all-zero vector/part.
171template <typename T, size_t N, HWY_IF_LE128(T, N)>
173 return Vec128<T, N>{wasm_i32x4_splat(0)};
174}
175template <size_t N, HWY_IF_LE128(float, N)>
177 return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
178}
179
180template <class D>
181using VFromD = decltype(Zero(D()));
182
183// ------------------------------ Set
184
185// Returns a vector/part with all lanes set to "t".
186template <size_t N, HWY_IF_LE128(uint8_t, N)>
188 return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
189}
190template <size_t N, HWY_IF_LE128(uint16_t, N)>
192 const uint16_t t) {
193 return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
194}
195template <size_t N, HWY_IF_LE128(uint32_t, N)>
197 const uint32_t t) {
198 return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
199}
200template <size_t N, HWY_IF_LE128(uint64_t, N)>
202 const uint64_t t) {
203 return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
204}
205
206template <size_t N, HWY_IF_LE128(int8_t, N)>
208 return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
209}
210template <size_t N, HWY_IF_LE128(int16_t, N)>
212 return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
213}
214template <size_t N, HWY_IF_LE128(int32_t, N)>
216 return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
217}
218template <size_t N, HWY_IF_LE128(int64_t, N)>
220 return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
221}
222
223template <size_t N, HWY_IF_LE128(float, N)>
225 return Vec128<float, N>{wasm_f32x4_splat(t)};
226}
227
228HWY_DIAGNOSTICS(push)
229HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
230
231// Returns a vector with uninitialized elements.
232template <typename T, size_t N, HWY_IF_LE128(T, N)>
234 return Zero(d);
235}
236
238
239// Returns a vector with lane i=[0, N) set to "first" + i.
240template <typename T, size_t N, typename T2>
241Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
242 HWY_ALIGN T lanes[16 / sizeof(T)];
243 for (size_t i = 0; i < 16 / sizeof(T); ++i) {
244 lanes[i] = static_cast<T>(first + static_cast<T2>(i));
245 }
246 return Load(d, lanes);
247}
248
249// ================================================== ARITHMETIC
250
251// ------------------------------ Addition
252
253// Unsigned
254template <size_t N>
256 const Vec128<uint8_t, N> b) {
257 return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
258}
259template <size_t N>
261 const Vec128<uint16_t, N> b) {
262 return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
263}
264template <size_t N>
266 const Vec128<uint32_t, N> b) {
267 return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
268}
269template <size_t N>
271 const Vec128<uint64_t, N> b) {
272 return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
273}
274
275// Signed
276template <size_t N>
278 const Vec128<int8_t, N> b) {
279 return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
280}
281template <size_t N>
283 const Vec128<int16_t, N> b) {
284 return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
285}
286template <size_t N>
288 const Vec128<int32_t, N> b) {
289 return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
290}
291template <size_t N>
293 const Vec128<int64_t, N> b) {
294 return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
295}
296
297// Float
298template <size_t N>
300 const Vec128<float, N> b) {
301 return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
302}
303
304// ------------------------------ Subtraction
305
306// Unsigned
307template <size_t N>
309 const Vec128<uint8_t, N> b) {
310 return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
311}
312template <size_t N>
315 return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
316}
317template <size_t N>
319 const Vec128<uint32_t, N> b) {
320 return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
321}
322template <size_t N>
324 const Vec128<uint64_t, N> b) {
325 return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
326}
327
328// Signed
329template <size_t N>
331 const Vec128<int8_t, N> b) {
332 return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
333}
334template <size_t N>
336 const Vec128<int16_t, N> b) {
337 return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
338}
339template <size_t N>
341 const Vec128<int32_t, N> b) {
342 return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
343}
344template <size_t N>
346 const Vec128<int64_t, N> b) {
347 return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
348}
349
350// Float
351template <size_t N>
353 const Vec128<float, N> b) {
354 return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
355}
356
357// ------------------------------ SaturatedAdd
358
359// Returns a + b clamped to the destination range.
360
361// Unsigned
362template <size_t N>
364 const Vec128<uint8_t, N> b) {
365 return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
366}
367template <size_t N>
369 const Vec128<uint16_t, N> b) {
370 return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
371}
372
373// Signed
374template <size_t N>
376 const Vec128<int8_t, N> b) {
377 return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
378}
379template <size_t N>
381 const Vec128<int16_t, N> b) {
382 return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
383}
384
385// ------------------------------ SaturatedSub
386
387// Returns a - b clamped to the destination range.
388
389// Unsigned
390template <size_t N>
392 const Vec128<uint8_t, N> b) {
393 return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
394}
395template <size_t N>
397 const Vec128<uint16_t, N> b) {
398 return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
399}
400
401// Signed
402template <size_t N>
404 const Vec128<int8_t, N> b) {
405 return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
406}
407template <size_t N>
409 const Vec128<int16_t, N> b) {
410 return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
411}
412
413// ------------------------------ Average
414
415// Returns (a + b + 1) / 2
416
417// Unsigned
418template <size_t N>
420 const Vec128<uint8_t, N> b) {
421 return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
422}
423template <size_t N>
425 const Vec128<uint16_t, N> b) {
426 return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
427}
428
429// ------------------------------ Absolute value
430
431// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
432template <size_t N>
434 return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
435}
436template <size_t N>
438 return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
439}
440template <size_t N>
442 return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
443}
444template <size_t N>
446 return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
447}
448
449template <size_t N>
451 return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
452}
453
454// ------------------------------ Shift lanes by constant #bits
455
456// Unsigned
457template <int kBits, size_t N>
459 return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
460}
461template <int kBits, size_t N>
463 return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
464}
465template <int kBits, size_t N>
467 return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
468}
469template <int kBits, size_t N>
471 return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
472}
473template <int kBits, size_t N>
475 return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
476}
477template <int kBits, size_t N>
479 return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
480}
481
482// Signed
483template <int kBits, size_t N>
485 return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
486}
487template <int kBits, size_t N>
489 return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
490}
491template <int kBits, size_t N>
493 return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
494}
495template <int kBits, size_t N>
497 return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
498}
499template <int kBits, size_t N>
501 return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
502}
503template <int kBits, size_t N>
505 return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
506}
507
508// 8-bit
509template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
511 const DFromV<decltype(v)> d8;
512 // Use raw instead of BitCast to support N=1.
513 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
514 return kBits == 1
515 ? (v + v)
516 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
517}
518
519template <int kBits, size_t N>
521 const DFromV<decltype(v)> d8;
522 // Use raw instead of BitCast to support N=1.
523 const Vec128<uint8_t, N> shifted{
524 ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
525 return shifted & Set(d8, 0xFF >> kBits);
526}
527
528template <int kBits, size_t N>
530 const DFromV<decltype(v)> di;
531 const RebindToUnsigned<decltype(di)> du;
532 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
533 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
534 return (shifted ^ shifted_sign) - shifted_sign;
535}
536
537// ------------------------------ RotateRight (ShiftRight, Or)
538template <int kBits, typename T, size_t N>
539HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
540 constexpr size_t kSizeInBits = sizeof(T) * 8;
541 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
542 if (kBits == 0) return v;
543 return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
544}
545
546// ------------------------------ Shift lanes by same variable #bits
547
548// After https://reviews.llvm.org/D108415 shift argument became unsigned.
549HWY_DIAGNOSTICS(push)
550HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
551
552// Unsigned
553template <size_t N>
555 const int bits) {
556 return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
557}
558template <size_t N>
560 const int bits) {
561 return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
562}
563template <size_t N>
565 const int bits) {
566 return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
567}
568template <size_t N>
570 const int bits) {
571 return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
572}
573template <size_t N>
575 const int bits) {
576 return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
577}
578template <size_t N>
580 const int bits) {
581 return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
582}
583
584// Signed
585template <size_t N>
587 const int bits) {
588 return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
589}
590template <size_t N>
592 const int bits) {
593 return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
594}
595template <size_t N>
597 const int bits) {
598 return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
599}
600template <size_t N>
602 const int bits) {
603 return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
604}
605template <size_t N>
607 const int bits) {
608 return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
609}
610template <size_t N>
612 const int bits) {
613 return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
614}
615
616// 8-bit
617template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
619 const DFromV<decltype(v)> d8;
620 // Use raw instead of BitCast to support N=1.
621 const Vec128<T, N> shifted{
622 ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
623 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
624}
625
626template <size_t N>
628 const int bits) {
629 const DFromV<decltype(v)> d8;
630 // Use raw instead of BitCast to support N=1.
631 const Vec128<uint8_t, N> shifted{
632 ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
633 return shifted & Set(d8, 0xFF >> bits);
634}
635
636template <size_t N>
638 const DFromV<decltype(v)> di;
639 const RebindToUnsigned<decltype(di)> du;
640 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
641 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
642 return (shifted ^ shifted_sign) - shifted_sign;
643}
644
645// ignore Wsign-conversion
647
648// ------------------------------ Minimum
649
650// Unsigned
651template <size_t N>
653 return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
654}
655template <size_t N>
657 return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
658}
659template <size_t N>
661 return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
662}
663template <size_t N>
664HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
665 // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
666 const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
667 const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
668 const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
669 const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
670 alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
671 return Vec128<uint64_t, N>{wasm_v128_load(min)};
672}
673
674// Signed
675template <size_t N>
677 return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
678}
679template <size_t N>
681 return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
682}
683template <size_t N>
685 return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
686}
687template <size_t N>
688HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
689 alignas(16) int64_t min[4];
690 min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
691 wasm_i64x2_extract_lane(b.raw, 0));
692 min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
693 wasm_i64x2_extract_lane(b.raw, 1));
694 return Vec128<int64_t, N>{wasm_v128_load(min)};
695}
696
697// Float
698template <size_t N>
700 return Vec128<float, N>{wasm_f32x4_min(a.raw, b.raw)};
701}
702
703// ------------------------------ Maximum
704
705// Unsigned
706template <size_t N>
708 return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
709}
710template <size_t N>
712 return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
713}
714template <size_t N>
716 return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
717}
718template <size_t N>
719HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
720 // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
721 const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
722 const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
723 const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
724 const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
725 alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
726 return Vec128<uint64_t, N>{wasm_v128_load(max)};
727}
728
729// Signed
730template <size_t N>
732 return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
733}
734template <size_t N>
736 return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
737}
738template <size_t N>
740 return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
741}
742template <size_t N>
743HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
744 alignas(16) int64_t max[2];
745 max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
746 wasm_i64x2_extract_lane(b.raw, 0));
747 max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
748 wasm_i64x2_extract_lane(b.raw, 1));
749 return Vec128<int64_t, N>{wasm_v128_load(max)};
750}
751
752// Float
753template <size_t N>
755 return Vec128<float, N>{wasm_f32x4_max(a.raw, b.raw)};
756}
757
758// ------------------------------ Integer multiplication
759
760// Unsigned
761template <size_t N>
763 const Vec128<uint16_t, N> b) {
764 return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
765}
766template <size_t N>
768 const Vec128<uint32_t, N> b) {
769 return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
770}
771
772// Signed
773template <size_t N>
775 const Vec128<int16_t, N> b) {
776 return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
777}
778template <size_t N>
780 const Vec128<int32_t, N> b) {
781 return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
782}
783
784// Returns the upper 16 bits of a * b in each lane.
785template <size_t N>
786HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
787 const Vec128<uint16_t, N> b) {
788 // TODO(eustas): replace, when implemented in WASM.
789 const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
790 const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
791 const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
792 const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
793 const auto l = wasm_i32x4_mul(al, bl);
794 const auto h = wasm_i32x4_mul(ah, bh);
795 // TODO(eustas): shift-right + narrow?
796 return Vec128<uint16_t, N>{
797 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
798}
799template <size_t N>
800HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
801 const Vec128<int16_t, N> b) {
802 // TODO(eustas): replace, when implemented in WASM.
803 const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
804 const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
805 const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
806 const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
807 const auto l = wasm_i32x4_mul(al, bl);
808 const auto h = wasm_i32x4_mul(ah, bh);
809 // TODO(eustas): shift-right + narrow?
810 return Vec128<int16_t, N>{
811 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
812}
813
814template <size_t N>
815HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
816 Vec128<int16_t, N> b) {
817 const DFromV<decltype(a)> d;
818 const RebindToUnsigned<decltype(d)> du;
819
820 const Vec128<uint16_t, N> lo = BitCast(du, Mul(a, b));
821 const Vec128<int16_t, N> hi = MulHigh(a, b);
822 // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
823 // carry that into the result. Instead isolate the top two bits because only
824 // they can influence the result.
825 const Vec128<uint16_t, N> lo_top2 = ShiftRight<14>(lo);
826 // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
827 const Vec128<uint16_t, N> rounding = ShiftRight<1>(Add(lo_top2, Set(du, 1)));
828 return Add(Add(hi, hi), BitCast(d, rounding));
829}
830
831// Multiplies even lanes (0, 2 ..) and returns the double-width result.
832template <size_t N>
833HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
834 const Vec128<int32_t, N> b) {
835 // TODO(eustas): replace, when implemented in WASM.
836 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
837 const auto ae = wasm_v128_and(a.raw, kEvenMask);
838 const auto be = wasm_v128_and(b.raw, kEvenMask);
839 return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
840}
841template <size_t N>
842HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
843 const Vec128<uint32_t, N> b) {
844 // TODO(eustas): replace, when implemented in WASM.
845 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
846 const auto ae = wasm_v128_and(a.raw, kEvenMask);
847 const auto be = wasm_v128_and(b.raw, kEvenMask);
848 return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
849}
850
851// ------------------------------ Negate
852
853template <typename T, size_t N, HWY_IF_FLOAT(T)>
854HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
855 return Xor(v, SignBit(DFromV<decltype(v)>()));
856}
857
858template <size_t N>
860 return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
861}
862template <size_t N>
864 return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
865}
866template <size_t N>
868 return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
869}
870template <size_t N>
872 return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
873}
874
875// ------------------------------ Floating-point mul / div
876
877template <size_t N>
879 return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
880}
881
882template <size_t N>
883HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
884 const Vec128<float, N> b) {
885 return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
886}
887
888// Approximate reciprocal
889template <size_t N>
890HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
891 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
892 return one / v;
893}
894
895// Absolute value of difference.
896template <size_t N>
897HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
898 const Vec128<float, N> b) {
899 return Abs(a - b);
900}
901
902// ------------------------------ Floating-point multiply-add variants
903
904// Returns mul * x + add
905template <size_t N>
906HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
907 const Vec128<float, N> x,
908 const Vec128<float, N> add) {
909 // TODO(eustas): replace, when implemented in WASM.
910 // TODO(eustas): is it wasm_f32x4_qfma?
911 return mul * x + add;
912}
913
914// Returns add - mul * x
915template <size_t N>
916HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
917 const Vec128<float, N> x,
918 const Vec128<float, N> add) {
919 // TODO(eustas): replace, when implemented in WASM.
920 return add - mul * x;
921}
922
923// Returns mul * x - sub
924template <size_t N>
925HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
926 const Vec128<float, N> x,
927 const Vec128<float, N> sub) {
928 // TODO(eustas): replace, when implemented in WASM.
929 // TODO(eustas): is it wasm_f32x4_qfms?
930 return mul * x - sub;
931}
932
933// Returns -mul * x - sub
934template <size_t N>
935HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
936 const Vec128<float, N> x,
937 const Vec128<float, N> sub) {
938 // TODO(eustas): replace, when implemented in WASM.
939 return Neg(mul) * x - sub;
940}
941
942// ------------------------------ Floating-point square root
943
944// Full precision square root
945template <size_t N>
946HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
947 return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
948}
949
950// Approximate reciprocal square root
951template <size_t N>
952HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
953 // TODO(eustas): find cheaper a way to calculate this.
954 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
955 return one / Sqrt(v);
956}
957
958// ------------------------------ Floating-point rounding
959
960// Toward nearest integer, ties to even
961template <size_t N>
962HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
963 return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
964}
965
966// Toward zero, aka truncate
967template <size_t N>
968HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
969 return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
970}
971
972// Toward +infinity, aka ceiling
973template <size_t N>
974HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
975 return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
976}
977
978// Toward -infinity, aka floor
979template <size_t N>
980HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
981 return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
982}
983
984// ------------------------------ Floating-point classification
985template <typename T, size_t N>
986HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
987 return v != v;
988}
989
990template <typename T, size_t N, HWY_IF_FLOAT(T)>
991HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
992 const Simd<T, N, 0> d;
993 const RebindToSigned<decltype(d)> di;
994 const VFromD<decltype(di)> vi = BitCast(di, v);
995 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
996 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
997}
998
999// Returns whether normal/subnormal/zero.
1000template <typename T, size_t N, HWY_IF_FLOAT(T)>
1001HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
1002 const Simd<T, N, 0> d;
1003 const RebindToUnsigned<decltype(d)> du;
1004 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
1005 const VFromD<decltype(du)> vu = BitCast(du, v);
1006 // 'Shift left' to clear the sign bit, then right so we can compare with the
1007 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
1008 // negative and non-negative floats would be greater).
1009 const VFromD<decltype(di)> exp =
1010 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
1011 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
1012}
1013
1014// ================================================== COMPARE
1015
1016// Comparisons fill a lane with 1-bits if the condition is true, else 0.
1017
1018template <typename TFrom, typename TTo, size_t N>
1019HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
1020 Mask128<TFrom, N> m) {
1021 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1022 return Mask128<TTo, N>{m.raw};
1023}
1024
1025template <typename T, size_t N>
1026HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1027 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1028 return (v & bit) == bit;
1029}
1030
1031// ------------------------------ Equality
1032
1033// Unsigned
1034template <size_t N>
1036 const Vec128<uint8_t, N> b) {
1037 return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
1038}
1039template <size_t N>
1041 const Vec128<uint16_t, N> b) {
1042 return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
1043}
1044template <size_t N>
1046 const Vec128<uint32_t, N> b) {
1047 return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
1048}
1049template <size_t N>
1051 const Vec128<uint64_t, N> b) {
1052 return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
1053}
1054
1055// Signed
1056template <size_t N>
1058 const Vec128<int8_t, N> b) {
1059 return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
1060}
1061template <size_t N>
1064 return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
1065}
1066template <size_t N>
1068 const Vec128<int32_t, N> b) {
1069 return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
1070}
1071template <size_t N>
1073 const Vec128<int64_t, N> b) {
1074 return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
1075}
1076
1077// Float
1078template <size_t N>
1080 const Vec128<float, N> b) {
1081 return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
1082}
1083
1084// ------------------------------ Inequality
1085
1086// Unsigned
1087template <size_t N>
1089 const Vec128<uint8_t, N> b) {
1090 return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
1091}
1092template <size_t N>
1094 const Vec128<uint16_t, N> b) {
1095 return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
1096}
1097template <size_t N>
1099 const Vec128<uint32_t, N> b) {
1100 return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
1101}
1102template <size_t N>
1104 const Vec128<uint64_t, N> b) {
1105 return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
1106}
1107
1108// Signed
1109template <size_t N>
1111 const Vec128<int8_t, N> b) {
1112 return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
1113}
1114template <size_t N>
1116 const Vec128<int16_t, N> b) {
1117 return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
1118}
1119template <size_t N>
1121 const Vec128<int32_t, N> b) {
1122 return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
1123}
1124template <size_t N>
1126 const Vec128<int64_t, N> b) {
1127 return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
1128}
1129
1130// Float
1131template <size_t N>
1133 const Vec128<float, N> b) {
1134 return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
1135}
1136
1137// ------------------------------ Strict inequality
1138
1139template <size_t N>
1141 const Vec128<int8_t, N> b) {
1142 return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
1143}
1144template <size_t N>
1146 const Vec128<int16_t, N> b) {
1147 return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
1148}
1149template <size_t N>
1151 const Vec128<int32_t, N> b) {
1152 return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
1153}
1154template <size_t N>
1156 const Vec128<int64_t, N> b) {
1157 return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
1158}
1159
1160template <size_t N>
1162 const Vec128<uint8_t, N> b) {
1163 return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
1164}
1165template <size_t N>
1167 const Vec128<uint16_t, N> b) {
1168 return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
1169}
1170template <size_t N>
1172 const Vec128<uint32_t, N> b) {
1173 return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
1174}
1175template <size_t N>
1177 const Vec128<uint64_t, N> b) {
1178 const DFromV<decltype(a)> d;
1179 const Repartition<uint32_t, decltype(d)> d32;
1180 const auto a32 = BitCast(d32, a);
1181 const auto b32 = BitCast(d32, b);
1182 // If the upper halves are not equal, this is the answer.
1183 const auto m_gt = a32 > b32;
1184
1185 // Otherwise, the lower half decides.
1186 const auto m_eq = a32 == b32;
1187 const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
1188 const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));
1189
1190 const auto gt = Or(lo_gt, m_gt);
1191 // Copy result in upper 32 bits to lower 32 bits.
1192 return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
1193}
1194
1195template <size_t N>
1197 const Vec128<float, N> b) {
1198 return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
1199}
1200
1201template <typename T, size_t N>
1202HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
1203 return operator>(b, a);
1204}
1205
1206// ------------------------------ Weak inequality
1207
1208// Float <= >=
1209template <size_t N>
1211 const Vec128<float, N> b) {
1212 return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
1213}
1214template <size_t N>
1216 const Vec128<float, N> b) {
1217 return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
1218}
1219
1220// ------------------------------ FirstN (Iota, Lt)
1221
1222template <typename T, size_t N>
1223HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
1224 const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
1225 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1226}
1227
1228// ================================================== LOGICAL
1229
1230// ------------------------------ Not
1231
1232template <typename T, size_t N>
1233HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
1234 return Vec128<T, N>{wasm_v128_not(v.raw)};
1235}
1236
1237// ------------------------------ And
1238
1239template <typename T, size_t N>
1240HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
1241 return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
1242}
1243
1244// ------------------------------ AndNot
1245
1246// Returns ~not_mask & mask.
1247template <typename T, size_t N>
1248HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
1249 return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
1250}
1251
1252// ------------------------------ Or
1253
1254template <typename T, size_t N>
1255HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
1256 return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
1257}
1258
1259// ------------------------------ Xor
1260
1261template <typename T, size_t N>
1262HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
1263 return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
1264}
1265
1266// ------------------------------ Or3
1267
1268template <typename T, size_t N>
1269HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
1270 return Or(o1, Or(o2, o3));
1271}
1272
1273// ------------------------------ OrAnd
1274
1275template <typename T, size_t N>
1276HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
1277 return Or(o, And(a1, a2));
1278}
1279
1280// ------------------------------ IfVecThenElse
1281
1282template <typename T, size_t N>
1283HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
1284 Vec128<T, N> no) {
1285 return IfThenElse(MaskFromVec(mask), yes, no);
1286}
1287
1288// ------------------------------ Operator overloads (internal-only if float)
1289
1290template <typename T, size_t N>
1291HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
1292 return And(a, b);
1293}
1294
1295template <typename T, size_t N>
1296HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
1297 return Or(a, b);
1298}
1299
1300template <typename T, size_t N>
1301HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
1302 return Xor(a, b);
1303}
1304
1305// ------------------------------ CopySign
1306
1307template <typename T, size_t N>
1308HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
1309 const Vec128<T, N> sign) {
1310 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1311 const auto msb = SignBit(DFromV<decltype(magn)>());
1312 return Or(AndNot(msb, magn), And(msb, sign));
1313}
1314
1315template <typename T, size_t N>
1316HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
1317 const Vec128<T, N> sign) {
1318 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1319 return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
1320}
1321
1322// ------------------------------ BroadcastSignBit (compare)
1323
1324template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
1325HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
1326 return ShiftRight<sizeof(T) * 8 - 1>(v);
1327}
1328template <size_t N>
1330 const DFromV<decltype(v)> d;
1331 return VecFromMask(d, v < Zero(d));
1332}
1333
1334// ------------------------------ Mask
1335
1336// Mask and Vec are the same (true = FF..FF).
1337template <typename T, size_t N>
1338HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1339 return Mask128<T, N>{v.raw};
1340}
1341
1342template <typename T, size_t N>
1343HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
1344 return Vec128<T, N>{v.raw};
1345}
1346
1347// mask ? yes : no
1348template <typename T, size_t N>
1349HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1350 Vec128<T, N> no) {
1351 return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
1352}
1353
1354// mask ? yes : 0
1355template <typename T, size_t N>
1356HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1357 return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
1358}
1359
1360// mask ? 0 : no
1361template <typename T, size_t N>
1362HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1363 return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
1364}
1365
1366template <typename T, size_t N>
1367HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1368 Vec128<T, N> no) {
1369 static_assert(IsSigned<T>(), "Only works for signed/float");
1370 const DFromV<decltype(v)> d;
1371 const RebindToSigned<decltype(d)> di;
1372
1373 v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
1374 return IfThenElse(MaskFromVec(v), yes, no);
1375}
1376
1377template <typename T, size_t N, HWY_IF_FLOAT(T)>
1379 const DFromV<decltype(v)> d;
1380 const auto zero = Zero(d);
1381 return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
1382}
1383
1384// ------------------------------ Mask logical
1385
1386template <typename T, size_t N>
1387HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1388 return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
1389}
1390
1391template <typename T, size_t N>
1392HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1393 const Simd<T, N, 0> d;
1394 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1395}
1396
1397template <typename T, size_t N>
1398HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1399 const Simd<T, N, 0> d;
1400 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1401}
1402
1403template <typename T, size_t N>
1404HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1405 const Simd<T, N, 0> d;
1406 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1407}
1408
1409template <typename T, size_t N>
1410HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1411 const Simd<T, N, 0> d;
1412 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1413}
1414
1415// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
1416
1417// The x86 multiply-by-Pow2() trick will not work because WASM saturates
1418// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
1419// scalar count operand, per-lane shift instructions would require extract_lane
1420// for each lane, and hoping that shuffle is correctly mapped to a native
1421// instruction. Using non-vector shifts would incur a store-load forwarding
1422// stall when loading the result vector. We instead test bits of the shift
1423// count to "predicate" a shift of the entire vector by a constant.
1424
1425template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1427 const DFromV<decltype(v)> d;
1428 Mask128<T, N> mask;
1429 // Need a signed type for BroadcastSignBit.
1430 auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1431 // Move the highest valid bit of the shift count into the sign bit.
1432 test = ShiftLeft<12>(test);
1433
1434 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1435 test = ShiftLeft<1>(test); // next bit (descending order)
1436 v = IfThenElse(mask, ShiftLeft<8>(v), v);
1437
1438 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1439 test = ShiftLeft<1>(test); // next bit (descending order)
1440 v = IfThenElse(mask, ShiftLeft<4>(v), v);
1441
1442 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1443 test = ShiftLeft<1>(test); // next bit (descending order)
1444 v = IfThenElse(mask, ShiftLeft<2>(v), v);
1445
1446 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1447 return IfThenElse(mask, ShiftLeft<1>(v), v);
1448}
1449
// 32-bit lanes: binary-decompose the count (bits 4..0) and conditionally
// apply constant shifts of 16/8/4/2/1.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}
1478
1479template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1480HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
1481 const DFromV<decltype(v)> d;
1482 alignas(16) T lanes[2];
1483 alignas(16) T bits_lanes[2];
1484 Store(v, d, lanes);
1485 Store(bits, d, bits_lanes);
1486 lanes[0] <<= bits_lanes[0];
1487 lanes[1] <<= bits_lanes[1];
1488 return Load(d, lanes);
1489}
1490
1491// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
1492
1493template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1495 const DFromV<decltype(v)> d;
1496 Mask128<T, N> mask;
1497 // Need a signed type for BroadcastSignBit.
1498 auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1499 // Move the highest valid bit of the shift count into the sign bit.
1500 test = ShiftLeft<12>(test);
1501
1502 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1503 test = ShiftLeft<1>(test); // next bit (descending order)
1504 v = IfThenElse(mask, ShiftRight<8>(v), v);
1505
1506 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1507 test = ShiftLeft<1>(test); // next bit (descending order)
1508 v = IfThenElse(mask, ShiftRight<4>(v), v);
1509
1510 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1511 test = ShiftLeft<1>(test); // next bit (descending order)
1512 v = IfThenElse(mask, ShiftRight<2>(v), v);
1513
1514 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1515 return IfThenElse(mask, ShiftRight<1>(v), v);
1516}
1517
// 32-bit lanes: binary-decompose the count (bits 4..0) and conditionally
// apply constant shifts of 16/8/4/2/1.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}
1546
1547// ================================================== MEMORY
1548
1549// ------------------------------ Load
1550
1551template <typename T>
1552HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1553 return Vec128<T>{wasm_v128_load(aligned)};
1554}
1555
1556template <typename T, size_t N>
1557HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
1558 const T* HWY_RESTRICT aligned) {
1559 return IfThenElseZero(m, Load(d, aligned));
1560}
1561
1562// Partial load.
1563template <typename T, size_t N, HWY_IF_LE64(T, N)>
1566 CopyBytes<sizeof(T) * N>(p, &v);
1567 return v;
1568}
1569
1570// LoadU == Load.
1571template <typename T, size_t N>
1572HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1573 return Load(d, p);
1574}
1575
1576// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1577template <typename T, size_t N, HWY_IF_LE128(T, N)>
1578HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1579 return Load(d, p);
1580}
1581
1582// ------------------------------ Store
1583
1584template <typename T>
1585HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
1586 wasm_v128_store(aligned, v.raw);
1587}
1588
1589// Partial store.
1590template <typename T, size_t N, HWY_IF_LE64(T, N)>
1592 CopyBytes<sizeof(T) * N>(&v, p);
1593}
1594
1596 float* HWY_RESTRICT p) {
1597 *p = wasm_f32x4_extract_lane(v.raw, 0);
1598}
1599
1600// StoreU == Store.
1601template <typename T, size_t N>
1602HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
1603 Store(v, d, p);
1604}
1605
1606template <typename T, size_t N>
1607HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
1608 T* HWY_RESTRICT p) {
1609 StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
1610}
1611
// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

// Forwards to a normal aligned store; no cache-bypass hint is applied here.
template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                    T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}
1621
1622// ------------------------------ Scatter (Store)
1623
1624template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
1625HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
1626 T* HWY_RESTRICT base,
1627 const Vec128<Offset, N> offset) {
1628 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1629
1630 alignas(16) T lanes[N];
1631 Store(v, d, lanes);
1632
1633 alignas(16) Offset offset_lanes[N];
1634 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
1635
1636 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
1637 for (size_t i = 0; i < N; ++i) {
1638 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1639 }
1640}
1641
1642template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
1643HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
1644 const Vec128<Index, N> index) {
1645 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1646
1647 alignas(16) T lanes[N];
1648 Store(v, d, lanes);
1649
1650 alignas(16) Index index_lanes[N];
1651 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
1652
1653 for (size_t i = 0; i < N; ++i) {
1654 base[index_lanes[i]] = lanes[i];
1655 }
1656}
1657
1658// ------------------------------ Gather (Load/Store)
1659
1660template <typename T, size_t N, typename Offset>
1661HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
1662 const T* HWY_RESTRICT base,
1663 const Vec128<Offset, N> offset) {
1664 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1665
1666 alignas(16) Offset offset_lanes[N];
1667 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
1668
1669 alignas(16) T lanes[N];
1670 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
1671 for (size_t i = 0; i < N; ++i) {
1672 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1673 }
1674 return Load(d, lanes);
1675}
1676
1677template <typename T, size_t N, typename Index>
1678HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
1679 const T* HWY_RESTRICT base,
1680 const Vec128<Index, N> index) {
1681 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1682
1683 alignas(16) Index index_lanes[N];
1684 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
1685
1686 alignas(16) T lanes[N];
1687 for (size_t i = 0; i < N; ++i) {
1688 lanes[i] = base[index_lanes[i]];
1689 }
1690 return Load(d, lanes);
1691}
1692
1693// ================================================== SWIZZLE
1694
1695// ------------------------------ ExtractLane
1696
1697namespace detail {
1698
1699template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1701 return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
1702}
1703template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1705 return static_cast<T>(wasm_i16x8_extract_lane(v.raw, kLane));
1706}
1707template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1708HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1709 return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
1710}
1711template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1712HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1713 return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
1714}
1715
1716template <size_t kLane, size_t N>
1718 return wasm_f32x4_extract_lane(v.raw, kLane);
1719}
1720
1721} // namespace detail
1722
// One overload per vector length just in case *_extract_lane raise compile
// errors if their argument is out of bounds (even if that would never be
// reached at runtime).

// Returns lane i of v; i may be a runtime value.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return GetLane(v);
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction extract.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
    }
  }
#endif
  // Fallback for runtime i: spill to memory and index.
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction extract.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
    }
  }
#endif
  // Fallback for runtime i: spill to memory and index.
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction extract.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
    }
  }
#endif
  // Fallback for runtime i: spill to memory and index.
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction extract.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
      case 8:
        return detail::ExtractLane<8>(v);
      case 9:
        return detail::ExtractLane<9>(v);
      case 10:
        return detail::ExtractLane<10>(v);
      case 11:
        return detail::ExtractLane<11>(v);
      case 12:
        return detail::ExtractLane<12>(v);
      case 13:
        return detail::ExtractLane<13>(v);
      case 14:
        return detail::ExtractLane<14>(v);
      case 15:
        return detail::ExtractLane<15>(v);
    }
  }
#endif
  // Fallback for runtime i: spill to memory and index.
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// ------------------------------ GetLane
// Returns the first (index 0) lane.
template <typename T, size_t N>
HWY_API T GetLane(const Vec128<T, N> v) {
  return detail::ExtractLane<0>(v);
}
1850
1851// ------------------------------ InsertLane
1852
1853namespace detail {
1854
1855template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1857 static_assert(kLane < N, "Lane index out of bounds");
1858 return Vec128<T, N>{
1859 wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
1860}
1861
1862template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1864 static_assert(kLane < N, "Lane index out of bounds");
1865 return Vec128<T, N>{
1866 wasm_i16x8_replace_lane(v.raw, kLane, static_cast<int16_t>(t))};
1867}
1868
1869template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1870HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
1871 static_assert(kLane < N, "Lane index out of bounds");
1872 return Vec128<T, N>{
1873 wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
1874}
1875
1876template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1877HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
1878 static_assert(kLane < N, "Lane index out of bounds");
1879 return Vec128<T, N>{
1880 wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
1881}
1882
1883template <size_t kLane, size_t N>
1885 static_assert(kLane < N, "Lane index out of bounds");
1886 return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)};
1887}
1888
1889template <size_t kLane, size_t N>
1891 static_assert(kLane < 2, "Lane index out of bounds");
1892 return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)};
1893}
1894
1895} // namespace detail
1896
// Requires one overload per vector length because InsertLane<3> may be a
// compile error if it calls wasm_f64x2_replace_lane.

// Returns v with lane i replaced by t; i may be a runtime value.
template <typename T>
HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  return Set(DFromV<decltype(v)>(), t);
}

template <typename T>
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction replace.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
    }
  }
#endif
  // Fallback for runtime i: spill to memory, modify, reload.
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction replace.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
    }
  }
#endif
  // Fallback for runtime i: spill to memory, modify, reload.
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction replace.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
    }
  }
#endif
  // Fallback for runtime i: spill to memory, modify, reload.
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  // If i is a compile-time constant, use the single-instruction replace.
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  // Fallback for runtime i: spill to memory, modify, reload.
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}
2026
2027// ------------------------------ LowerHalf
2028
2029template <typename T, size_t N>
2030HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
2031 Vec128<T, N> v) {
2032 return Vec128<T, N / 2>{v.raw};
2033}
2034
2035template <typename T, size_t N>
2036HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
2037 return LowerHalf(Simd<T, N / 2, 0>(), v);
2038}
2039
// ------------------------------ ShiftLeftBytes

// Shifts the entire vector left by kBytes bytes, shifting in zeros.
// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  // Shuffle index 16 selects from `zero`, i.e. fills with a zero byte.
  const __i8x16 zero = wasm_i8x16_splat(0);
  switch (kBytes) {
    case 0:
      return v;

    case 1:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
                                             6, 7, 8, 9, 10, 11, 12, 13, 14)};

    case 2:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
                                             5, 6, 7, 8, 9, 10, 11, 12, 13)};

    case 3:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
                                             3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};

    case 4:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
                                             2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};

    case 5:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
                                             1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};

    case 6:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};

    case 7:
      return Vec128<T, N>{wasm_i8x16_shuffle(
          v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};

    case 8:
      return Vec128<T, N>{wasm_i8x16_shuffle(
          v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};

    case 9:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
                                             6)};

    case 10:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
                                             5)};

    case 11:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
                                             4)};

    case 12:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 0, 1,
                                             2, 3)};

    case 13:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 16, 0,
                                             1, 2)};

    case 14:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 16, 16,
                                             0, 1)};

    case 15:
      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
                                             16, 16, 16, 16, 16, 16, 16, 16, 16,
                                             16, 0)};
  }
  // kBytes == 16: everything shifted out.
  return Vec128<T, N>{zero};
}
2120
2121template <int kBytes, typename T, size_t N>
2122HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2123 return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
2124}
2125
2126// ------------------------------ ShiftLeftLanes
2127
2128template <int kLanes, typename T, size_t N>
2129HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2130 const Repartition<uint8_t, decltype(d)> d8;
2131 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2132}
2133
2134template <int kLanes, typename T, size_t N>
2135HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
2136 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2137}
2138
2139// ------------------------------ ShiftRightBytes
2140namespace detail {
2141
2142// Helper function allows zeroing invalid lanes in caller.
2143template <int kBytes, typename T, size_t N>
2145 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2146 const __i8x16 zero = wasm_i8x16_splat(0);
2147
2148 switch (kBytes) {
2149 case 0:
2150 return v.raw;
2151
2152 case 1:
2153 return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2154 12, 13, 14, 15, 16);
2155
2156 case 2:
2157 return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2158 13, 14, 15, 16, 16);
2159
2160 case 3:
2161 return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2162 13, 14, 15, 16, 16, 16);
2163
2164 case 4:
2165 return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2166 14, 15, 16, 16, 16, 16);
2167
2168 case 5:
2169 return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2170 15, 16, 16, 16, 16, 16);
2171
2172 case 6:
2173 return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2174 16, 16, 16, 16, 16, 16);
2175
2176 case 7:
2177 return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2178 16, 16, 16, 16, 16, 16, 16);
2179
2180 case 8:
2181 return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2182 16, 16, 16, 16, 16, 16, 16);
2183
2184 case 9:
2185 return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
2186 16, 16, 16, 16, 16, 16, 16);
2187
2188 case 10:
2189 return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
2190 16, 16, 16, 16, 16, 16, 16);
2191
2192 case 11:
2193 return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
2194 16, 16, 16, 16, 16, 16, 16);
2195
2196 case 12:
2197 return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
2198 16, 16, 16, 16, 16, 16, 16);
2199
2200 case 13:
2201 return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
2202 16, 16, 16, 16, 16, 16, 16);
2203
2204 case 14:
2205 return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
2206 16, 16, 16, 16, 16, 16, 16);
2207
2208 case 15:
2209 return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
2210 16, 16, 16, 16, 16, 16, 16);
2211 case 16:
2212 return zero;
2213 }
2214}
2215
2216} // namespace detail
2217
2218// 0x01..0F, kBytes = 1 => 0x0001..0E
2219template <int kBytes, typename T, size_t N>
2220HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
2221 // For partial vectors, clear upper lanes so we shift in zeros.
2222 if (N != 16 / sizeof(T)) {
2223 const Vec128<T> vfull{v.raw};
2224 v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
2225 }
2226 return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
2227}
2228
2229// ------------------------------ ShiftRightLanes
2230template <int kLanes, typename T, size_t N>
2231HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2232 const Repartition<uint8_t, decltype(d)> d8;
2233 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2234}
2235
2236// ------------------------------ UpperHalf (ShiftRightBytes)
2237
2238// Full input: copy hi into lo (smaller instruction encoding than shifts).
2239template <typename T>
2241 return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
2242}
2243HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
2244 return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
2245}
2246
2247// Partial
2248template <typename T, size_t N, HWY_IF_LE64(T, N)>
2249HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
2250 Vec128<T, N> v) {
2251 const DFromV<decltype(v)> d;
2252 const RebindToUnsigned<decltype(d)> du;
2253 const auto vu = BitCast(du, v);
2254 const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
2255 return Vec128<T, (N + 1) / 2>{upper.raw};
2256}
2257
2258// ------------------------------ CombineShiftRightBytes
2259
2260template <int kBytes, typename T, class V = Vec128<T>>
2262 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2263 switch (kBytes) {
2264 case 0:
2265 return lo;
2266
2267 case 1:
2268 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2269 11, 12, 13, 14, 15, 16)};
2270
2271 case 2:
2272 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2273 11, 12, 13, 14, 15, 16, 17)};
2274
2275 case 3:
2276 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2277 12, 13, 14, 15, 16, 17, 18)};
2278
2279 case 4:
2280 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2281 13, 14, 15, 16, 17, 18, 19)};
2282
2283 case 5:
2284 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2285 14, 15, 16, 17, 18, 19, 20)};
2286
2287 case 6:
2288 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
2289 14, 15, 16, 17, 18, 19, 20, 21)};
2290
2291 case 7:
2292 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
2293 15, 16, 17, 18, 19, 20, 21, 22)};
2294
2295 case 8:
2296 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
2297 16, 17, 18, 19, 20, 21, 22, 23)};
2298
2299 case 9:
2300 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
2301 17, 18, 19, 20, 21, 22, 23, 24)};
2302
2303 case 10:
2304 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
2305 17, 18, 19, 20, 21, 22, 23, 24, 25)};
2306
2307 case 11:
2308 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
2309 18, 19, 20, 21, 22, 23, 24, 25, 26)};
2310
2311 case 12:
2312 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
2313 19, 20, 21, 22, 23, 24, 25, 26, 27)};
2314
2315 case 13:
2316 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
2317 20, 21, 22, 23, 24, 25, 26, 27, 28)};
2318
2319 case 14:
2320 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
2321 21, 22, 23, 24, 25, 26, 27, 28, 29)};
2322
2323 case 15:
2324 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2325 22, 23, 24, 25, 26, 27, 28, 29, 30)};
2326 }
2327 return hi;
2328}
2329
2330template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
2331 class V = Vec128<T, N>>
2333 constexpr size_t kSize = N * sizeof(T);
2334 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
2335 const Repartition<uint8_t, decltype(d)> d8;
2336 const Full128<uint8_t> d_full8;
2337 using V8 = VFromD<decltype(d_full8)>;
2338 const V8 hi8{BitCast(d8, hi).raw};
2339 // Move into most-significant bytes
2340 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
2341 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
2342 return V{BitCast(Full128<T>(), r).raw};
2343}
2344
2345// ------------------------------ Broadcast/splat any lane
2346
2347template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2349 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2350 return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
2351 kLane, kLane, kLane, kLane, kLane)};
2352}
2353
2354template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2355HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2356 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2357 return Vec128<T, N>{
2358 wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
2359}
2360
2361template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2362HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2363 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2364 return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
2365}
2366
2367// ------------------------------ TableLookupBytes
2368
2369// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
2370// lane indices in [0, 16).
2371template <typename T, size_t N, typename TI, size_t NI>
2372HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
2373 const Vec128<TI, NI> from) {
2374// Not yet available in all engines, see
2375// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
2376// V8 implementation of this had a bug, fixed on 2021-04-03:
2377// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
2378#if 0
2379 return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
2380#else
2381 alignas(16) uint8_t control[16];
2382 alignas(16) uint8_t input[16];
2383 alignas(16) uint8_t output[16];
2384 wasm_v128_store(control, from.raw);
2385 wasm_v128_store(input, bytes.raw);
2386 for (size_t i = 0; i < 16; ++i) {
2387 output[i] = control[i] < 16 ? input[control[i]] : 0;
2388 }
2389 return Vec128<TI, NI>{wasm_v128_load(output)};
2390#endif
2391}
2392
2393template <typename T, size_t N, typename TI, size_t NI>
2394HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
2395 const Vec128<TI, NI> from) {
2396 const Simd<TI, NI, 0> d;
2397 // Mask size must match vector type, so cast everything to this type.
2398 Repartition<int8_t, decltype(d)> di8;
2399 Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
2400 const auto msb = BitCast(di8, from) < Zero(di8);
2401 const auto lookup =
2402 TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
2403 return BitCast(d, IfThenZeroElse(msb, lookup));
2404}
2405
2406// ------------------------------ Hard-coded shuffles
2407
2408// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
2409// Shuffle0321 rotates one lane to the right (the previous least-significant
2410// lane is now most-significant). These could also be implemented via
2411// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
2412
2413// Swap 32-bit halves in 64-bit halves.
2414template <typename T, size_t N>
2416 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2417 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2418 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
2419}
2420
2421// These are used by generic_ops-inl to implement LoadInterleaved3.
2422namespace detail {
2423
2424template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2426 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2427 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16,
2428 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2429 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2430}
2431template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2433 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2434 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8,
2435 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2436}
2437template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2438HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
2439 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2440 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
2441}
2442
2443template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2445 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2446 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16,
2447 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2448 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2449}
2450template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2452 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2453 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8,
2454 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2455}
2456template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2457HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
2458 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2459 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
2460}
2461
2462template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2464 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2465 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16,
2466 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2467 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2468}
2469template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2471 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2472 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8,
2473 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2474}
2475template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2476HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
2477 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2478 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
2479}
2480
2481} // namespace detail
2482
2483// Swap 64-bit halves
2484template <typename T>
2485HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
2486 static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
2487 return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2488}
2489template <typename T>
2490HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
2491 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2492 return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2493}
2494
2495// Rotate right 32 bits
2496template <typename T>
2497HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
2498 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2499 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
2500}
2501
2502// Rotate left 32 bits
2503template <typename T>
2504HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
2505 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2506 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
2507}
2508
2509// Reverse
2510template <typename T>
2511HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
2512 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2513 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
2514}
2515
2516// ------------------------------ TableLookupLanes
2517
2518// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
  // Byte-granularity shuffle control as produced by IndicesFromVec; consumed
  // by TableLookupLanes via TableLookupBytes.
  __v128_u raw;
};
2523
2524template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2526 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2527#if HWY_IS_DEBUG_BUILD
2528 const Rebind<TI, decltype(d)> di;
2529 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2530 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
2531#endif
2532
2533 const Repartition<uint8_t, decltype(d)> d8;
2534 using V8 = VFromD<decltype(d8)>;
2535 const Repartition<uint16_t, decltype(d)> d16;
2536
2537 // Broadcast each lane index to all bytes of T and shift to bytes
2538 static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
2539 if (sizeof(T) == 4) {
2540 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2541 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2542 const V8 lane_indices =
2543 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2544 const V8 byte_indices =
2545 BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
2546 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2547 0, 1, 2, 3, 0, 1, 2, 3};
2548 return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2549 } else {
2550 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2551 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2552 const V8 lane_indices =
2553 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2554 const V8 byte_indices =
2555 BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
2556 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2557 0, 1, 2, 3, 4, 5, 6, 7};
2558 return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2559 }
2560}
2561
2562template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2563HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
2564 const Rebind<TI, decltype(d)> di;
2565 return IndicesFromVec(d, LoadU(di, idx));
2566}
2567
2568template <typename T, size_t N>
2569HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2570 using TI = MakeSigned<T>;
2571 const DFromV<decltype(v)> d;
2572 const Rebind<TI, decltype(d)> di;
2573 return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
2574}
2575
2576// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
2577
2578// Single lane: no change
2579template <typename T>
2580HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
2581 return v;
2582}
2583
2584// Two lanes: shuffle
2585template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2586HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
2587 return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
2588}
2589
2590template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2591HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2592 return Shuffle01(v);
2593}
2594
2595// Four lanes: shuffle
2596template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2597HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2598 return Shuffle0123(v);
2599}
2600
2601// 16-bit
2602template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2603HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
2604 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2605 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
2606}
2607
2608// ------------------------------ Reverse2
2609
2610template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2611HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
2612 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2613 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2614}
2615
2616template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2617HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2618 return Shuffle2301(v);
2619}
2620
2621template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2622HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2623 return Shuffle01(v);
2624}
2625
2626// ------------------------------ Reverse4
2627
2628template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2629HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
2630 return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
2631 1, 0, 7, 6, 5, 4)});
2632}
2633
2634template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2635HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2636 return Shuffle0123(v);
2637}
2638
2639template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2640HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
2641 HWY_ASSERT(0); // don't have 8 u64 lanes
2642}
2643
2644// ------------------------------ Reverse8
2645
2646template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2647HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
2648 return Reverse(d, v);
2649}
2650
2651template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2652HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
2653 HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
2654}
2655
2656// ------------------------------ InterleaveLower
2657
2658template <size_t N>
2661 return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
2662 a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2663}
2664template <size_t N>
2667 return Vec128<uint16_t, N>{
2668 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2669}
2670template <size_t N>
2673 return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2674}
2675template <size_t N>
2678 return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2679}
2680
2681template <size_t N>
2684 return Vec128<int8_t, N>{wasm_i8x16_shuffle(
2685 a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2686}
2687template <size_t N>
2690 return Vec128<int16_t, N>{
2691 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2692}
2693template <size_t N>
2696 return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2697}
2698template <size_t N>
2701 return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2702}
2703
2704template <size_t N>
2706 Vec128<float, N> b) {
2707 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2708}
2709
2710template <size_t N>
2713 return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2714}
2715
2716// Additional overload for the optional tag.
2717template <class V>
2718HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2719 return InterleaveLower(a, b);
2720}
2721
2722// ------------------------------ InterleaveUpper (UpperHalf)
2723
2724// All functions inside detail lack the required D parameter.
2725namespace detail {
2726
2727template <size_t N>
2730 return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2731 26, 11, 27, 12, 28, 13, 29, 14,
2732 30, 15, 31)};
2733}
2734template <size_t N>
2737 return Vec128<uint16_t, N>{
2738 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2739}
2740template <size_t N>
2743 return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2744}
2745template <size_t N>
2748 return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2749}
2750
2751template <size_t N>
2754 return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2755 26, 11, 27, 12, 28, 13, 29, 14,
2756 30, 15, 31)};
2757}
2758template <size_t N>
2761 return Vec128<int16_t, N>{
2762 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2763}
2764template <size_t N>
2767 return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2768}
2769template <size_t N>
2772 return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2773}
2774
2775template <size_t N>
2777 Vec128<float, N> b) {
2778 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2779}
2780
2781template <size_t N>
2784 return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2785}
2786
2787} // namespace detail
2788
2789// Full
2790template <typename T, class V = Vec128<T>>
2791HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
2792 return detail::InterleaveUpper(a, b);
2793}
2794
2795// Partial
2796template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
2797HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
2798 const Half<decltype(d)> d2;
2799 return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
2800}
2801
2802// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2803
2804// Same as Interleave*, except that the return lanes are double-width integers;
2805// this is necessary because the single-lane scalar cannot return two values.
2806template <class V, class DW = RepartitionToWide<DFromV<V>>>
2807HWY_API VFromD<DW> ZipLower(V a, V b) {
2808 return BitCast(DW(), InterleaveLower(a, b));
2809}
2810template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2811HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2812 return BitCast(dw, InterleaveLower(D(), a, b));
2813}
2814
2815template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2816HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2817 return BitCast(dw, InterleaveUpper(D(), a, b));
2818}
2819
2820// ================================================== COMBINE
2821
2822// ------------------------------ Combine (InterleaveLower)
2823
2824// N = N/2 + N/2 (upper half undefined)
2825template <typename T, size_t N>
2826HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
2827 Vec128<T, N / 2> lo_half) {
2828 const Half<decltype(d)> d2;
2829 const RebindToUnsigned<decltype(d2)> du2;
2830 // Treat half-width input as one lane, and expand to two lanes.
2831 using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
2832 const VU lo{BitCast(du2, lo_half).raw};
2833 const VU hi{BitCast(du2, hi_half).raw};
2834 return BitCast(d, InterleaveLower(lo, hi));
2835}
2836
2837// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
2838
2839template <typename T, size_t N>
2840HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
2841 return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
2842}
2843
2844// ------------------------------ ConcatLowerLower
2845
2846// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2847template <typename T>
2849 const Vec128<T> lo) {
2850 return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
2851}
2852template <typename T, size_t N, HWY_IF_LE64(T, N)>
2853HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2854 const Vec128<T, N> lo) {
2855 const Half<decltype(d)> d2;
2856 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
2857}
2858
2859// ------------------------------ ConcatUpperUpper
2860
2861template <typename T>
2863 const Vec128<T> lo) {
2864 return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
2865}
2866template <typename T, size_t N, HWY_IF_LE64(T, N)>
2867HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2868 const Vec128<T, N> lo) {
2869 const Half<decltype(d)> d2;
2870 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
2871}
2872
2873// ------------------------------ ConcatLowerUpper
2874
2875template <typename T>
2877 const Vec128<T> lo) {
2878 return CombineShiftRightBytes<8>(d, hi, lo);
2879}
2880template <typename T, size_t N, HWY_IF_LE64(T, N)>
2881HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2882 const Vec128<T, N> lo) {
2883 const Half<decltype(d)> d2;
2884 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
2885}
2886
2887// ------------------------------ ConcatUpperLower
2888template <typename T, size_t N>
2889HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2890 const Vec128<T, N> lo) {
2891 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
2892}
2893
2894// ------------------------------ ConcatOdd
2895
2896// 8-bit full
2897template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2899 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
2900 17, 19, 21, 23, 25, 27, 29, 31)};
2901}
2902
2903// 8-bit x8
2904template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2906 Vec128<T, 8> lo) {
2907 // Don't care about upper half.
2908 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
2909 23, 1, 3, 5, 7, 17, 19, 21, 23)};
2910}
2911
2912// 8-bit x4
2913template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2914HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2915 Vec128<T, 4> lo) {
2916 // Don't care about upper 3/4.
2917 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
2918 19, 1, 3, 17, 19, 1, 3, 17, 19)};
2919}
2920
2921// 16-bit full
2922template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2923HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2924 return Vec128<T>{
2925 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
2926}
2927
2928// 16-bit x4
2929template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2930HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2931 Vec128<T, 4> lo) {
2932 // Don't care about upper half.
2933 return Vec128<T, 4>{
2934 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
2935}
2936
2937// 32-bit full
2938template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2939HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2940 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2941}
2942
2943// Any T x2
2944template <typename T>
2945HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
2946 Vec128<T, 2> lo) {
2947 return InterleaveUpper(d, lo, hi);
2948}
2949
2950// ------------------------------ ConcatEven (InterleaveLower)
2951
2952// 8-bit full
2953template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2955 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
2956 16, 18, 20, 22, 24, 26, 28, 30)};
2957}
2958
2959// 8-bit x8
2960template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2962 Vec128<T, 8> lo) {
2963 // Don't care about upper half.
2964 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20,
2965 22, 0, 2, 4, 6, 16, 18, 20, 22)};
2966}
2967
2968// 8-bit x4
2969template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2970HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2971 Vec128<T, 4> lo) {
2972 // Don't care about upper 3/4.
2973 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
2974 18, 0, 2, 16, 18, 0, 2, 16, 18)};
2975}
2976
2977// 16-bit full
2978template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2979HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2980 return Vec128<T>{
2981 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
2982}
2983
2984// 16-bit x4
2985template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2986HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2987 Vec128<T, 4> lo) {
2988 // Don't care about upper half.
2989 return Vec128<T, 4>{
2990 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
2991}
2992
2993// 32-bit full
2994template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2995HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2996 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2997}
2998
2999// Any T x2
3000template <typename T>
3001HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
3002 Vec128<T, 2> lo) {
3003 return InterleaveLower(d, lo, hi);
3004}
3005
3006// ------------------------------ DupEven (InterleaveLower)
3007
3008template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3009HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
3010 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
3011}
3012
3013template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3014HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
3015 return InterleaveLower(DFromV<decltype(v)>(), v, v);
3016}
3017
3018// ------------------------------ DupOdd (InterleaveUpper)
3019
3020template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3021HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3022 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
3023}
3024
3025template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3026HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
3027 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
3028}
3029
3030// ------------------------------ OddEven
3031
3032namespace detail {
3033
3034template <typename T, size_t N>
3036 const Vec128<T, N> b) {
3037 const DFromV<decltype(a)> d;
3038 const Repartition<uint8_t, decltype(d)> d8;
3039 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3040 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3041 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3042}
3043template <typename T, size_t N>
3045 const Vec128<T, N> b) {
3046 return Vec128<T, N>{
3047 wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
3048}
3049template <typename T, size_t N>
3051 const Vec128<T, N> b) {
3052 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3053}
3054template <typename T, size_t N>
3056 const Vec128<T, N> b) {
3057 return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
3058}
3059
3060} // namespace detail
3061
// Returns a vector with the odd lanes of a and the even lanes of b, by
// dispatching on lane size to the detail::OddEven overloads above.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
}
3066template <size_t N>
3068 const Vec128<float, N> b) {
3069 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3070}
3071
3072// ------------------------------ OddEvenBlocks
// Vectors here are at most one 128-bit block, so only the "even" block
// exists; return it unchanged.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}
3077
3078// ------------------------------ SwapAdjacentBlocks
3079
// Single 128-bit block: there is no adjacent block to swap with, so this is
// the identity.
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}
3084
3085// ------------------------------ ReverseBlocks
3086
3087// Single block: no change
// Single block: reversing the order of blocks is a no-op.
template <typename T>
HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
  return v;
}
3092
3093// ================================================== CONVERT
3094
3095// ------------------------------ Promotions (part w/ narrow lanes -> full)
3096
3097// Unsigned: zero-extend.
3098template <size_t N>
3100 const Vec128<uint8_t, N> v) {
3101 return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
3102}
3103template <size_t N>
3105 const Vec128<uint8_t, N> v) {
3106 return Vec128<uint32_t, N>{
3107 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3108}
3109template <size_t N>
3111 const Vec128<uint8_t, N> v) {
3112 return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
3113}
3114template <size_t N>
3116 const Vec128<uint8_t, N> v) {
3117 return Vec128<int32_t, N>{
3118 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3119}
// Zero-extends the lower u16 lanes to u32.
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
                                      const Vec128<uint16_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
3125template <size_t N>
3127 const Vec128<uint32_t, N> v) {
3128 return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
3129}
3130
3131template <size_t N>
3133 const Vec128<uint16_t, N> v) {
3134 return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
3135}
3136
3137// Signed: replicate sign bit.
// Sign-extends the lower i8 lanes to i16.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
                                     const Vec128<int8_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
}
// Sign-extends the lower i8 lanes to i32 in two widening steps (i8->i16->i32).
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
                                     const Vec128<int8_t, N> v) {
  return Vec128<int32_t, N>{
      wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
}
// Sign-extends the lower i16 lanes to i32.
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
                                     const Vec128<int16_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
}
// Sign-extends the lower i32 lanes to i64.
template <size_t N>
HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
                                     const Vec128<int32_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
}
3159
3160template <size_t N>
3162 const Vec128<int32_t, N> v) {
3163 return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
3164}
3165
// Converts IEEE binary16 to binary32 via integer bit manipulation (WASM SIMD
// has no native float16 instructions).
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
                                   const Vec128<float16_t, N> v) {
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  // Expand to u32 so we can shift.
  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
  // Decompose into sign (1 bit), biased exponent (5 bits), mantissa (10 bits).
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  // Subnormal f16: value is mantissa * 2^-24, computed exactly in f32 by
  // converting the integer mantissa and scaling by 1/(16384*1024).
  const auto subnormal =
      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
                        Set(df32, 1.0f / 16384 / 1024));

  // Normal f16: rebias the exponent (f32 bias 127 vs. f16 bias 15) and shift
  // the 10 mantissa bits to the top of the 23-bit f32 mantissa field.
  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  // A zero exponent field selects the subnormal/zero path.
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
}
3186
// bfloat16 is the upper 16 bits of a binary32: widen to 32 bits and shift the
// payload into the high half.
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
                                   const Vec128<bfloat16_t, N> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
3194
3195// ------------------------------ Demotions (full -> part w/ narrow lanes)
3196
3197template <size_t N>
3199 const Vec128<int32_t, N> v) {
3200 return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
3201}
3202
3203template <size_t N>
3205 const Vec128<int32_t, N> v) {
3206 return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
3207}
3208
3209template <size_t N>
3211 const Vec128<int32_t, N> v) {
3212 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3213 return Vec128<uint8_t, N>{
3214 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3215}
3216
3217template <size_t N>
3219 const Vec128<int16_t, N> v) {
3220 return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
3221}
3222
3223template <size_t N>
3225 const Vec128<int32_t, N> v) {
3226 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3227 return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
3228}
3229
3230template <size_t N>
3232 const Vec128<int16_t, N> v) {
3233 return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
3234}
3235
// Truncating, saturating f64 -> i32 conversion; the upper two i32 lanes of
// the result are zero.
template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* di */,
                                    const Vec128<double, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
}
3241
// Converts binary32 to IEEE binary16 via integer bit manipulation. Mantissa
// bits below f16 precision are discarded (truncation, i.e. round toward zero).
template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
                                      const Vec128<float, N> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint32_t, decltype(du16)> du;
  const RebindToSigned<decltype(du)> di;
  const auto bits32 = BitCast(du, v);
  // Decompose into sign (1 bit), biased exponent (8 bits), mantissa (23 bits).
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  // Unbiased exponent, clamped to the f16 maximum of 15.
  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  // Below 2^-24 even the smallest f16 subnormal underflows to zero.
  const auto is_tiny = exp < Set(di, -24);

  // Exponents in [-24, -14) produce f16 subnormals: exponent field is zero.
  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
  // Subnormal mantissa: implicit leading 1 plus the shifted-down explicit
  // mantissa bits.
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
                                     ShiftRight<13>(mantissa32));  // <1024

  // Assemble sign | exponent | mantissa, then zero out underflowed lanes.
  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
  return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
}
3271
// bfloat16 is the upper half of a binary32: shift right (truncating the low
// 16 mantissa bits) and narrow to 16 bits.
template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
                                       const Vec128<float, N> v) {
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}
3281
// Demotes a and b to bf16 in an interleaved order: b's (truncated) bf16
// values occupy the even u16 lanes, a's occupy the odd lanes (a float's high
// half, i.e. its bf16, already sits in the odd u16 lane).
template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
3290
3291// For already range-limited input [0, 255].
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  // Two saturating narrowing steps: i32 -> i16, then i16 -> u8. Inputs are
  // assumed to already be in [0, 255] (see comment above), so no saturation
  // actually occurs.
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<uint8_t, N>{
      wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
3298
3299// ------------------------------ Convert i32 <=> f32 (Round)
3300
3301template <size_t N>
3303 const Vec128<int32_t, N> v) {
3304 return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
3305}
3306// Truncates (rounds toward zero).
3307template <size_t N>
3309 const Vec128<float, N> v) {
3310 return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
3311}
3312
// Rounds to the nearest integer value first (Round), then converts; the
// conversion itself truncates, so the overall result is round-to-nearest.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
}
3317
3318// ================================================== MISC
3319
3320// ------------------------------ SumsOf8 (ShiftRight, Add)
3321template <size_t N>
3323 const DFromV<decltype(v)> du8;
3324 const RepartitionToWide<decltype(du8)> du16;
3325 const RepartitionToWide<decltype(du16)> du32;
3326 const RepartitionToWide<decltype(du32)> du64;
3327 using VU16 = VFromD<decltype(du16)>;
3328
3329 const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
3330 const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
3331 const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
3332
3333 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
3334 BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
3335 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
3336 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
3337 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
3338 BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
3339 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
3340 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
3341 return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
3342}
3343
3344// ------------------------------ LoadMaskBits (TestBit)
3345
3346namespace detail {
3347
// Expands up to 16 mask bits (1 bit per byte lane) into a byte mask.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require an >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));

  // Each byte then tests its own bit within the replicated mask byte.
  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}
3364
// Expands up to 8 mask bits (1 bit per 16-bit lane) into a lane mask: lane i
// is all-ones iff bit i of `bits` is set.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(
      d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
}
3372
3373template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3374HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3375 const RebindToUnsigned<decltype(d)> du;
3376 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
3377 return RebindMask(
3378 d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
3379}
3380
3381template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3382HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3383 const RebindToUnsigned<decltype(d)> du;
3384 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
3385 return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
3386}
3387
3388} // namespace detail
3389
// `bits` points to at least 8 readable bytes, not all of which need be valid.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
                                   const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  // Copy only the bytes containing the N mask bits (one bit per lane).
  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
  return detail::LoadMaskBits(d, mask_bits);
}
3398
3399// ------------------------------ Mask
3400
3401namespace detail {
3402
3403// Full
// Full 128-bit vector of byte lanes: returns 16 mask bits (bit i = lane i).
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
                                 const Mask128<T> mask) {
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, mask.raw);

  // Each mask byte is 0x00 or 0xFF. Multiplying by this constant packs one
  // bit per input byte into the top byte of the 64-bit product, which the
  // shifts then extract (8 bits per 64-bit half).
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
  return (hi + lo);
}
3415
3416// 64-bit
3417template <typename T>
3419 const Mask128<T, 8> mask) {
3420 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3421 return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
3422 kMagic) >>
3423 56;
3424}
3425
3426// 32-bit or less: need masking
3427template <typename T, size_t N, HWY_IF_LE32(T, N)>
3428HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3429 const Mask128<T, N> mask) {
3430 uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
3431 // Clear potentially undefined bytes.
3432 bytes &= (1ULL << (N * 8)) - 1;
3433 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3434 return (bytes * kMagic) >> 56;
3435}
3436
3437template <typename T, size_t N>
3439 const Mask128<T, N> mask) {
3440 // Remove useless lower half of each u16 while preserving the sign bit.
3441 const __i16x8 zero = wasm_i16x8_splat(0);
3442 const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
3443 return BitsFromMask(hwy::SizeTag<1>(), mask8);
3444}
3445
3446template <typename T, size_t N>
3448 const Mask128<T, N> mask) {
3449 const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
3450 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
3451 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
3452 alignas(16) uint32_t lanes[4];
3453 wasm_v128_store(lanes, sliced_mask);
3454 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
3455}
3456
3457template <typename T, size_t N>
3459 const Mask128<T, N> mask) {
3460 const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
3461 const __i64x2 slice = wasm_i64x2_make(1, 2);
3462 const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
3463 alignas(16) uint64_t lanes[2];
3464 wasm_v128_store(lanes, sliced_mask);
3465 return lanes[0] | lanes[1];
3466}
3467
3468// Returns the lowest N bits for the BitsFromMask result.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t bits) {
  // A full 16-byte vector has no inactive lanes; otherwise clear bits >= N.
  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
}
3473
3474// Returns 0xFF for bytes with index >= N, otherwise 0.
template <size_t N>
constexpr __i8x16 BytesAbove() {
  // Enumerates all 17 possible byte counts, using the widest lane type whose
  // boundaries match N so each constant needs the fewest make() arguments.
  return
      (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
      : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
      : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
      : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
      : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
      : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
      : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
      : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
      : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
      : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                   -1, -1, -1, -1, -1)
      : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                   -1, -1, -1, -1)
      : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
                                   -1, -1, -1, -1)
      : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
                                   -1, -1, -1)
      : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
                                   -1, -1, -1)
      : (N == 11)
          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
      : (N == 13)
          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
      : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
}
3503
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
  // Dispatch on lane size, then clear bits beyond the N active lanes.
  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
}
3508
// BitsFromMask yields one bit per lane, so PopCount gives the lane count.
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
  return PopCount(BitsFromMask(tag, m));
}
3513
// As above: one mask bit per 16-bit lane.
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
  return PopCount(BitsFromMask(tag, m));
}
3518
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
  // Each active (all-ones) lane retains a distinct bit 1/2/4/8 after the AND,
  // so OR-ing the two stored halves yields exactly one set bit per true lane.
  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, shifted_bits);
  return PopCount(lanes[0] | lanes[1]);
}
3527
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
  // Each 64-bit lane is 0 (false) or -1 (true), so negating their sum counts
  // the true lanes.
  alignas(16) int64_t lanes[2];
  wasm_v128_store(lanes, m.raw);
  return static_cast<size_t>(-(lanes[0] + lanes[1]));
}
3534
3535} // namespace detail
3536
3537// `p` points to at least 8 writable bytes.
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
                             const Mask128<T, N> mask, uint8_t* bits) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  // One mask bit per lane, rounded up to whole bytes.
  const size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}
3546
// Full vector: dispatch on lane size.
template <typename T, size_t N>
HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
}
3551
3552// Partial vector
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
  // Ensure all undefined bytes are 0.
  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
  // With the bytes past the partial vector cleared, the full-vector count
  // is exact.
  return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
}
3559
3560// Full vector
3561template <typename T>
3563#if 0
3564 // Casting followed by wasm_i8x16_any_true results in wasm error:
3565 // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
3566 const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
3567 return !wasm_i8x16_any_true(v8.raw);
3568#else
3569 (void)d;
3570 return (wasm_i64x2_extract_lane(m.raw, 0) |
3571 wasm_i64x2_extract_lane(m.raw, 1)) == 0;
3572#endif
3573}
3574
3575// Full vector
3576namespace detail {
3577template <typename T>
3579 return wasm_i8x16_all_true(m.raw);
3580}
3581template <typename T>
3583 return wasm_i16x8_all_true(m.raw);
3584}
3585template <typename T>
3587 return wasm_i32x4_all_true(m.raw);
3588}
3589template <typename T>
3591 return wasm_i64x2_all_true(m.raw);
3592}
3593
3594} // namespace detail
3595
// Full vector: dispatch on lane size to the wasm all_true intrinsics.
template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
}
3600
3601// Partial vectors
3602
3603template <typename T, size_t N, HWY_IF_LE64(T, N)>
3605 // Ensure all undefined bytes are 0.
3606 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3607 return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
3608}
3609
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
  // Ensure all undefined bytes are FF.
  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
  // With the bytes past the partial vector forced to true, the full-vector
  // test is exact.
  return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
}
3616
// Returns the index of the first true lane, or -1 if none is true.
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
                               const Mask128<T, N> mask) {
  const uint64_t bits = detail::BitsFromMask(mask);
  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
}
3623
3624// ------------------------------ Compress
3625
3626namespace detail {
3627
// Returns a vector of byte indices that, via TableLookupBytes, gathers the
// 16-bit lanes selected by mask_bits (one bit per lane) to the front.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Simd<T, N, 0> d;
  const Rebind<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;

  // We need byte indices for TableLookupBytes (one vector's worth for each of
  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
  // with the doubling baked into the table. Unpacking nibbles is likely more
  // costly than the higher cache footprint from storing bytes.
  alignas(16) constexpr uint8_t table[256 * 8] = {
      // PrintCompress16x8Tables
      0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,  //
      2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,  //
      4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,  //
      2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,  //
      6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,  //
      2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,  //
      4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,  //
      2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,  //
      8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,  //
      2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,  //
      4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,  //
      2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,  //
      6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,  //
      2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,  //
      4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,  //
      2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,  //
      10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,  //
      2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,  //
      4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,  //
      2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,  //
      6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,  //
      2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,  //
      4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,  //
      2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,  //
      8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,  //
      2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,  //
      4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,  //
      2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,  //
      6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,  //
      2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,  //
      4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,  //
      2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,  //
      12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,  //
      2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,  //
      4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,  //
      2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,  //
      6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,  //
      2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,  //
      4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,  //
      2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,  //
      8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,  //
      2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,  //
      4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,  //
      2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,  //
      6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,  //
      2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,  //
      4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,  //
      2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,  //
      10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,  //
      2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,  //
      4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,  //
      2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,  //
      6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,  //
      2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,  //
      4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,  //
      2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,  //
      8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,  //
      2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,  //
      4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,  //
      2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,  //
      6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,  //
      2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,  //
      4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,  //
      2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,  //
      14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,  //
      2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,  //
      4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,  //
      2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,  //
      6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,  //
      2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,  //
      4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,  //
      2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,  //
      8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,  //
      2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,  //
      4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,  //
      2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,  //
      6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,  //
      2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,  //
      4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,  //
      2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,  //
      10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,  //
      2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,  //
      4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,  //
      2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,  //
      6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,  //
      2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,  //
      4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,  //
      2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,  //
      8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,  //
      2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,  //
      4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,  //
      2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,  //
      6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,  //
      2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,  //
      4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,  //
      2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,  //
      12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,  //
      2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,  //
      4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,  //
      2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,  //
      6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,  //
      2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,  //
      4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,  //
      2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,  //
      8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,  //
      2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,  //
      4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,  //
      2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,  //
      6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,  //
      2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,  //
      4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,  //
      2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,  //
      10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,  //
      2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,  //
      4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,  //
      2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,  //
      6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,  //
      2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,  //
      4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,  //
      2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,  //
      8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,  //
      2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,  //
      4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,  //
      2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,  //
      6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,  //
      2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,  //
      4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,  //
      2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};

  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  // Zip duplicates each byte index b into a u16 (b, b); adding 0x0100 turns
  // that into the byte pair {b, b + 1}, addressing both bytes of the lane.
  return BitCast(d, pairs + Set(du, 0x0100));
}
3775
3776template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3777HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
3778 HWY_DASSERT(mask_bits < 256);
3779 const Simd<T, N, 0> d;
3780 const Rebind<uint8_t, decltype(d)> d8;
3781 const Simd<uint16_t, N, 0> du;
3782
3783 // We need byte indices for TableLookupBytes (one vector's worth for each of
3784 // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
3785 // can instead store lane indices and convert to byte indices (2*lane + 0..1),
3786 // with the doubling baked into the table. Unpacking nibbles is likely more
3787 // costly than the higher cache footprint from storing bytes.
3788 alignas(16) constexpr uint8_t table[256 * 8] = {
3789 // PrintCompressNot16x8Tables
3790 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
3791 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
3792 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
3793 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
3794 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
3795 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
3796 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
3797 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
3798 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
3799 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
3800 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
3801 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
3802 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
3803 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
3804 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
3805 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
3806 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
3807 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
3808 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
3809 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
3810 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
3811 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
3812 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
3813 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
3814 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
3815 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
3816 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
3817 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
3818 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
3819 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
3820 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
3821 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
3822 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
3823 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
3824 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
3825 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
3826 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
3827 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
3828 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
3829 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
3830 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
3831 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
3832 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
3833 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
3834 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
3835 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
3836 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
3837 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
3838 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
3839 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
3840 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
3841 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
3842 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
3843 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
3844 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
3845 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
3846 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
3847 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
3848 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
3849 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
3850 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
3851 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
3852 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
3853 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
3854 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
3855 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
3856 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
3857 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
3858 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
3859 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
3860 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
3861 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
3862 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
3863 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
3864 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
3865 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
3866 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
3867 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
3868 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
3869 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
3870 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
3871 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
3872 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
3873 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
3874 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
3875 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
3876 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
3877 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
3878 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
3879 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
3880 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
3881 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
3882 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
3883 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
3884 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
3885 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
3886 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
3887 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
3888 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
3889 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
3890 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
3891 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
3892 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
3893 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
3894 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
3895 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
3896 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
3897 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
3898 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
3899 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
3900 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
3901 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
3902 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
3903 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
3904 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
3905 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
3906 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
3907 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
3908 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
3909 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
3910 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
3911 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
3912 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
3913 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
3914 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
3915 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
3916 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
3917 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
3918
3919 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
3920 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
3921 return BitCast(d, pairs + Set(du, 0x0100));
3922}
3923
// Returns the byte-shuffle index vector that moves the lanes selected by
// mask_bits to the front, preserving their relative order (32-bit lanes).
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);  // one mask bit per lane, at most 4 lanes

  // There are only 4 lanes, so we can afford to load the index vector directly.
  // Row i holds byte indices: lanes whose bit in i is set come first (ascending
  // order), followed by the remaining lanes.
  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
      // PrintCompress32x4Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,  //
      0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,  //
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  //
      0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,  //
      4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,  //
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,  //
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,  //
      0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,  //
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  // Reinterpret the selected 16-byte row as lanes of T.
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
3951
// Returns the byte-shuffle index vector that moves the lanes NOT selected by
// mask_bits to the front, preserving their relative order (32-bit lanes).
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);  // one mask bit per lane, at most 4 lanes

  // There are only 4 lanes, so we can afford to load the index vector directly.
  // Row i holds byte indices: lanes whose bit in i is CLEAR come first
  // (ascending order), followed by the remaining lanes.
  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
      // PrintCompressNot32x4Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,  //
      0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,  //
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,  //
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,  //
      4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,  //
      0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,  //
      12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,  //
      0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,  //
      8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  // Reinterpret the selected 16-byte row as lanes of T.
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
3978
// Returns the byte-shuffle index vector that moves the lanes selected by
// mask_bits to the front (64-bit lanes).
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);  // one mask bit per lane, at most 2 lanes

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
      // PrintCompress64x2Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
3995
// Returns the byte-shuffle index vector that moves the lanes NOT selected by
// mask_bits to the front (64-bit lanes).
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);  // one mask bit per lane, at most 2 lanes

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
      // PrintCompressNot64x2Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
4012
4013// Helper functions called by both Compress and CompressStore - avoids a
4014// redundant BitsFromMask in the latter.
4015
4016template <typename T, size_t N>
4017HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
4018 const auto idx = detail::IdxFromBits<T, N>(mask_bits);
4019 const DFromV<decltype(v)> d;
4020 const RebindToSigned<decltype(d)> di;
4021 return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4022}
4023
4024template <typename T, size_t N>
4025HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
4026 const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
4027 const DFromV<decltype(v)> d;
4028 const RebindToSigned<decltype(d)> di;
4029 return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4030}
4031
4032} // namespace detail
4033
// Nonzero `value` advertises that this target's Compress writes the kept lanes
// first and the remaining lanes afterwards (a partition), which callers may
// rely on.
template <typename T>
struct CompressIsPartition {
  enum { value = 1 };
};
4038
// Single lane: no-op (there is nothing to move).
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}
4044
4045// Two lanes: conditional swap
4046template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4048 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
4049 const Full128<T> d;
4050 const Vec128<T> m = VecFromMask(d, mask);
4051 const Vec128<T> maskL = DupEven(m);
4052 const Vec128<T> maskH = DupOdd(m);
4053 const Vec128<T> swap = AndNot(maskL, maskH);
4054 return IfVecThenElse(swap, Shuffle01(v), v);
4055}
4056
4057// General case
4058template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4059HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4061}
4062
// Single lane: no-op (there is nothing to move).
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}
4068
4069// Two lanes: conditional swap
4070template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4071HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
4072 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
4073 const Full128<T> d;
4074 const Vec128<T> m = VecFromMask(d, mask);
4075 const Vec128<T> maskL = DupEven(m);
4076 const Vec128<T> maskH = DupOdd(m);
4077 const Vec128<T> swap = AndNot(maskH, maskL);
4078 return IfVecThenElse(swap, Shuffle01(v), v);
4079}
4080
4081// General case
4082template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4083HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
4084 // For partial vectors, we cannot pull the Not() into the table because
4085 // BitsFromMask clears the upper bits.
4086 if (N < 16 / sizeof(T)) {
4088 }
4090}
4091// ------------------------------ CompressBlocksNot
// A 128-bit vector is a single block, so there is nothing to compress: no-op.
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /* m */) {
  return v;
}
4096
4097// ------------------------------ CompressBits
4098
4099template <typename T, size_t N>
4100HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
4101 const uint8_t* HWY_RESTRICT bits) {
4102 uint64_t mask_bits = 0;
4103 constexpr size_t kNumBytes = (N + 7) / 8;
4104 CopyBytes<kNumBytes>(bits, &mask_bits);
4105 if (N < 8) {
4106 mask_bits &= (1ull << N) - 1;
4107 }
4108
4109 return detail::Compress(v, mask_bits);
4110}
4111
4112// ------------------------------ CompressStore
4113template <typename T, size_t N>
4114HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
4115 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
4116 const uint64_t mask_bits = detail::BitsFromMask(mask);
4117 const auto c = detail::Compress(v, mask_bits);
4118 StoreU(c, d, unaligned);
4119 return PopCount(mask_bits);
4120}
4121
4122// ------------------------------ CompressBlendedStore
// Compresses v by m and stores only the first PopCount(m) lanes, leaving the
// remaining destination lanes untouched. Returns the number of lanes written.
template <typename T, size_t N>
HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
                                    Simd<T, N, 0> d,
                                    T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
  using TU = TFromD<decltype(du)>;
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  // Compress in the unsigned domain, then blend-store only the first `count`
  // lanes so bytes past them keep their previous contents.
  const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
  const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
  return count;
}
4136
4137// ------------------------------ CompressBitsStore
4138
4139template <typename T, size_t N>
4140HWY_API size_t CompressBitsStore(Vec128<T, N> v,
4141 const uint8_t* HWY_RESTRICT bits,
4142 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
4143 uint64_t mask_bits = 0;
4144 constexpr size_t kNumBytes = (N + 7) / 8;
4145 CopyBytes<kNumBytes>(bits, &mask_bits);
4146 if (N < 8) {
4147 mask_bits &= (1ull << N) - 1;
4148 }
4149
4150 const auto c = detail::Compress(v, mask_bits);
4151 StoreU(c, d, unaligned);
4152 return PopCount(mask_bits);
4153}
4154
4155// ------------------------------ StoreInterleaved2/3/4
4156
4157// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
4158// generic_ops-inl.h.
4159
4160// ------------------------------ MulEven/Odd (Load)
4161
4162HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
4163 const Vec128<uint64_t> b) {
4164 alignas(16) uint64_t mul[2];
4165 mul[0] =
4166 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
4167 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
4168 return Load(Full128<uint64_t>(), mul);
4169}
4170
4171HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
4172 const Vec128<uint64_t> b) {
4173 alignas(16) uint64_t mul[2];
4174 mul[0] =
4175 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
4176 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
4177 return Load(Full128<uint64_t>(), mul);
4178}
4179
4180// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4181
// Widens bf16 inputs to f32, multiplies, and accumulates: products of the
// lower halves of a/b are added into sum0 (returned), products of the upper
// halves into sum1 (in-out reference).
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
                                                   Vec128<bfloat16_t, 2 * N> a,
                                                   Vec128<bfloat16_t, 2 * N> b,
                                                   const Vec128<float, N> sum0,
                                                   Vec128<float, N>& sum1) {
  const Repartition<uint16_t, decltype(df32)> du16;
  const RebindToUnsigned<decltype(df32)> du32;
  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
  // Zipping zero below each bf16 places its bits in the upper 16 of a u32;
  // BitCast to f32 then yields the widened value (bf16 is the upper half of
  // binary32).
  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
4198
4199// ------------------------------ Reductions
4200
4201namespace detail {
4202
4203// N=1 for any T: no-op
4204template <typename T>
4206 const Vec128<T, 1> v) {
4207 return v;
4208}
// N=1: a single lane is its own minimum.
template <typename T>
HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}
// N=1: a single lane is its own maximum.
template <typename T>
HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
                                   const Vec128<T, 1> v) {
  return v;
}
4219
4220// u32/i32/f32:
4221
4222// N=2
4223template <typename T>
4225 const Vec128<T, 2> v10) {
4226 return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
4227}
4228template <typename T>
4230 const Vec128<T, 2> v10) {
4231 return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
4232}
// N=2 (32-bit lanes): swap the pair and take the max; both lanes then hold it.
template <typename T>
HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
4238
4239// N=4 (full)
4240template <typename T>
4242 const Vec128<T> v3210) {
4243 const Vec128<T> v1032 = Shuffle1032(v3210);
4244 const Vec128<T> v31_20_31_20 = v3210 + v1032;
4245 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4246 return v20_31_20_31 + v31_20_31_20;
4247}
4248template <typename T>
4250 const Vec128<T> v3210) {
4251 const Vec128<T> v1032 = Shuffle1032(v3210);
4252 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
4253 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4254 return Min(v20_31_20_31, v31_20_31_20);
4255}
// N=4 (full, 32-bit lanes): two shuffle+max steps broadcast the maximum to
// all lanes.
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}
4264
4265// u64/i64/f64:
4266
4267// N=2 (full)
4268template <typename T>
4270 const Vec128<T> v10) {
4271 const Vec128<T> v01 = Shuffle01(v10);
4272 return v10 + v01;
4273}
4274template <typename T>
4276 const Vec128<T> v10) {
4277 const Vec128<T> v01 = Shuffle01(v10);
4278 return Min(v10, v01);
4279}
// N=2 (full, 64-bit lanes): swap the pair and take the max; both lanes then
// hold the result.
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return Max(v10, v01);
}
4286
4287// u16/i16
4288template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4289HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4290 const DFromV<decltype(v)> d;
4291 const Repartition<int32_t, decltype(d)> d32;
4292 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4293 const auto odd = ShiftRight<16>(BitCast(d32, v));
4294 const auto min = MinOfLanes(d32, Min(even, odd));
4295 // Also broadcast into odd lanes.
4296 return BitCast(d, Or(min, ShiftLeft<16>(min)));
4297}
4298template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4299HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4300 const DFromV<decltype(v)> d;
4301 const Repartition<int32_t, decltype(d)> d32;
4302 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4303 const auto odd = ShiftRight<16>(BitCast(d32, v));
4304 const auto min = MaxOfLanes(d32, Max(even, odd));
4305 // Also broadcast into odd lanes.
4306 return BitCast(d, Or(min, ShiftLeft<16>(min)));
4307}
4308
4309} // namespace detail
4310
// Supported for u/i/f 32/64. Returns the same value in each lane.
// Public entry point; dispatches on lane size to the detail overloads above.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
// Minimum across all lanes, broadcast to every lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
// Maximum across all lanes, broadcast to every lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
4324
4325// ------------------------------ Lt128
4326
// 128-bit unsigned < over pairs of u64 lanes: lane pair {hi, lo} of `a` is
// compared with that of `b`; the result is replicated to both lanes of the
// pair.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
                               Vec128<T, N> b) {
  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL | out = cH | (=H & cL)
  //  0  0  0  0 |  0
  //  0  0  0  1 |  0
  //  0  0  1  0 |  1
  //  0  0  1  1 |  1
  //  0  1  0  0 |  0
  //  0  1  0  1 |  0
  //  0  1  1  0 |  1
  //  1  0  0  0 |  0
  //  1  0  0  1 |  1
  //  1  1  0  0 |  0
  const Mask128<T, N> eqHL = Eq(a, b);
  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
  // comparison result leftwards requires only 4. IfThenElse compiles to the
  // same code as OrAnd().
  const Vec128<T, N> ltLx = DupEven(ltHL);  // cL in both lanes of the pair
  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
  // Broadcast the upper-lane verdict to both lanes of each pair.
  return MaskFromVec(DupOdd(outHx));
}
4354
// As Lt128, but only the upper (most-significant) u64 of each pair is
// compared; the verdict is broadcast to both lanes of the pair.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
                                    Vec128<T, N> b) {
  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
}
4361
4362// ------------------------------ Min128, Max128 (Lt128)
4363
// Without a native OddEven, it seems infeasible to go faster than Lt128.

// 128-bit minimum of u64 pairs: selects a where a < b, else b.
template <class D>
HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

// 128-bit maximum of u64 pairs: selects a where b < a, else b.
template <class D>
HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

// As Min128, but comparing only the upper u64 of each pair.
template <class D>
HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

// As Max128, but comparing only the upper u64 of each pair.
template <class D>
HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}
4384
4385// ================================================== Operator wrapper
4386
// Named wrappers over the overloaded operators, for generic code that passes
// the operation as a functor/template parameter.
template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}

// NOTE(review): unlike the other wrappers here, Shl/Shr lack HWY_API —
// possibly an oversight; confirm against the other per-target headers.
template <class V>
V Shl(V a, V b) {
  return a << b;
}
template <class V>
V Shr(V a, V b) {
  return a >> b;
}

// Comparisons return the mask type produced by operator== (deduced so these
// work for both vectors and scalars).
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}
4440
4441// NOLINTNEXTLINE(google-readability-namespace-comments)
4442} // namespace HWY_NAMESPACE
4443} // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
detail::Raw128< T >::type raw
Definition: wasm_128-inl.h:106
Raw raw
Definition: arm_neon-inl.h:814
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: wasm_128-inl.h:75
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: wasm_128-inl.h:81
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: wasm_128-inl.h:90
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: wasm_128-inl.h:87
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: wasm_128-inl.h:72
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: wasm_128-inl.h:84
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: wasm_128-inl.h:78
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition: wasm_128-inl.h:2144
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2463
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
constexpr __i8x16 BytesAbove()
Definition: wasm_128-inl.h:3476
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition: wasm_128-inl.h:1700
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:131
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2444
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
Vec128< T, 4/sizeof(T)> Vec32
Definition: arm_neon-inl.h:800
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
Vec128< T, 8/sizeof(T)> Vec64
Definition: arm_neon-inl.h:797
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: arm_neon-inl.h:3883
__v128_u raw
Definition: wasm_128-inl.h:2521
Definition: ops/shared-inl.h:40
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition: wasm_128-inl.h:151
HWY_INLINE __v128_u operator()(__v128_u v)
Definition: wasm_128-inl.h:147
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: wasm_128-inl.h:114
__f32x4 type
Definition: wasm_128-inl.h:60
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:56
Definition: base.h:358
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()