arm_neon-inl.h
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit ARM64 NEON vectors and operations.
// External include guard in highway.h - see comment there.

// ARM NEON intrinsics are documented at:
// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

namespace detail {  // for code folding and Raw128

// Macros used to define single and double function calls for multiple types
// for full and half vectors. These macros are undefined at the end of the file.

// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

// HWY_NEON_BUILD_RET_* is the return type; the type argument omits the _t
// suffix so we can extend it to int32x4x2_t packs.
#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
// function.
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to itself be a macro,
// as is the case for some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS is defined as
// "a, b" (without the quotes), expands to vshlq_u8(a, b). Directly writing
// vshlq_u8(MY_PARAMS) would fail because the vshlq_u8() macro expects two
// arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
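
// Illustrative sketch (added for exposition; MY_PARAMS is a hypothetical
// macro, not defined in this header). The extra expansion step is what makes
// the call well-formed when the intrinsic is itself a macro:
//
//   #define MY_PARAMS x, y
//   // HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) first expands MY_PARAMS to "x, y",
//   // then forms vshlq_u8(x, y) -- two arguments, as the macro requires.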

// Main macro definition that defines a single function for the given type and
// size of vector, using the underlying (prefix##infix##suffix) function and
// the template, return type, parameters and arguments defined by the "args"
// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }
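
// Expansion sketch (added for exposition; "Add" is a hypothetical name, not
// an operation defined here). The family macros below pass prefix##q for the
// full 128-bit vector, so HWY_NEON_DEF_FUNCTION(uint8, 16, Add, vaddq, _, u8,
// 2) would expand to roughly:
//
//   HWY_API Vec128<uint8_t, 16> Add(const Vec128<uint8_t, 16> a,
//                                   const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
//   }
//
// i.e. prefix + infix + suffix select the intrinsic, and args=2 selects the
// two-parameter template/return/parameter/argument sets.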

// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name" using the set of neon functions starting with the given
// "prefix" for all the variants of certain types, as specified next to each
// macro. For example, the prefix "vsub" can be used to define the operator-
// using args=2.

// uint8_t
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

// float
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

// double
#if HWY_ARCH_ARM_A64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif

// float and double

#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)         \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// Helper macros to define for more than one type.
// uint8_t, uint16_t and uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

// int8_t, int16_t and int32_t
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

// uint8_t, uint16_t, uint32_t and uint64_t
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

// int8_t, int16_t, int32_t and int64_t
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

// All int*_t and uint*_t up to 64
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

// All previous types.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)

// Emulation of some intrinsics on armv7.
#if HWY_ARCH_ARM_V7
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif
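
// Semantics sketch (added for exposition): on AArch64, vuzp1/vuzp2 gather the
// even-/odd-indexed lanes of the concatenation of the two inputs, while
// vzip1/vzip2 interleave their lower/upper halves. The armv7 emulation above
// relies on vuzp/vzip returning both results as a pair. For example, with
// hypothetical inputs
//   x = {0, 2, 4, 6}, y = {1, 3, 5, 7}  (uint16x4_t):
//   vuzp1_u16(x, y) -> {0, 4, 1, 5}  and  vzip1_u16(x, y) -> {0, 1, 2, 3}.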

// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 overloads
// for all vector types, even those (bfloat16_t) where the underlying vector is
// the same as others (uint16_t).
template <typename T, size_t N>
struct Tuple2;
template <typename T, size_t N>
struct Tuple3;
template <typename T, size_t N>
struct Tuple4;

template <>
struct Tuple2<uint8_t, 16> {
  uint8x16x2_t raw;
};
template <size_t N>
struct Tuple2<uint8_t, N> {
  uint8x8x2_t raw;
};
template <>
struct Tuple2<int8_t, 16> {
  int8x16x2_t raw;
};
template <size_t N>
struct Tuple2<int8_t, N> {
  int8x8x2_t raw;
};
template <>
struct Tuple2<uint16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<uint16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<int16_t, 8> {
  int16x8x2_t raw;
};
template <size_t N>
struct Tuple2<int16_t, N> {
  int16x4x2_t raw;
};
template <>
struct Tuple2<uint32_t, 4> {
  uint32x4x2_t raw;
};
template <size_t N>
struct Tuple2<uint32_t, N> {
  uint32x2x2_t raw;
};
template <>
struct Tuple2<int32_t, 4> {
  int32x4x2_t raw;
};
template <size_t N>
struct Tuple2<int32_t, N> {
  int32x2x2_t raw;
};
template <>
struct Tuple2<uint64_t, 2> {
  uint64x2x2_t raw;
};
template <size_t N>
struct Tuple2<uint64_t, N> {
  uint64x1x2_t raw;
};
template <>
struct Tuple2<int64_t, 2> {
  int64x2x2_t raw;
};
template <size_t N>
struct Tuple2<int64_t, N> {
  int64x1x2_t raw;
};

template <>
struct Tuple2<float16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<float16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<bfloat16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<bfloat16_t, N> {
  uint16x4x2_t raw;
};

template <>
struct Tuple2<float32_t, 4> {
  float32x4x2_t raw;
};
template <size_t N>
struct Tuple2<float32_t, N> {
  float32x2x2_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple2<float64_t, 2> {
  float64x2x2_t raw;
};
template <size_t N>
struct Tuple2<float64_t, N> {
  float64x1x2_t raw;
};
#endif  // HWY_ARCH_ARM_A64

template <>
struct Tuple3<uint8_t, 16> {
  uint8x16x3_t raw;
};
template <size_t N>
struct Tuple3<uint8_t, N> {
  uint8x8x3_t raw;
};
template <>
struct Tuple3<int8_t, 16> {
  int8x16x3_t raw;
};
template <size_t N>
struct Tuple3<int8_t, N> {
  int8x8x3_t raw;
};
template <>
struct Tuple3<uint16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<uint16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<int16_t, 8> {
  int16x8x3_t raw;
};
template <size_t N>
struct Tuple3<int16_t, N> {
  int16x4x3_t raw;
};
template <>
struct Tuple3<uint32_t, 4> {
  uint32x4x3_t raw;
};
template <size_t N>
struct Tuple3<uint32_t, N> {
  uint32x2x3_t raw;
};
template <>
struct Tuple3<int32_t, 4> {
  int32x4x3_t raw;
};
template <size_t N>
struct Tuple3<int32_t, N> {
  int32x2x3_t raw;
};
template <>
struct Tuple3<uint64_t, 2> {
  uint64x2x3_t raw;
};
template <size_t N>
struct Tuple3<uint64_t, N> {
  uint64x1x3_t raw;
};
template <>
struct Tuple3<int64_t, 2> {
  int64x2x3_t raw;
};
template <size_t N>
struct Tuple3<int64_t, N> {
  int64x1x3_t raw;
};

template <>
struct Tuple3<float16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<float16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<bfloat16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<bfloat16_t, N> {
  uint16x4x3_t raw;
};

template <>
struct Tuple3<float32_t, 4> {
  float32x4x3_t raw;
};
template <size_t N>
struct Tuple3<float32_t, N> {
  float32x2x3_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple3<float64_t, 2> {
  float64x2x3_t raw;
};
template <size_t N>
struct Tuple3<float64_t, N> {
  float64x1x3_t raw;
};
#endif  // HWY_ARCH_ARM_A64

template <>
struct Tuple4<uint8_t, 16> {
  uint8x16x4_t raw;
};
template <size_t N>
struct Tuple4<uint8_t, N> {
  uint8x8x4_t raw;
};
template <>
struct Tuple4<int8_t, 16> {
  int8x16x4_t raw;
};
template <size_t N>
struct Tuple4<int8_t, N> {
  int8x8x4_t raw;
};
template <>
struct Tuple4<uint16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<uint16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<int16_t, 8> {
  int16x8x4_t raw;
};
template <size_t N>
struct Tuple4<int16_t, N> {
  int16x4x4_t raw;
};
template <>
struct Tuple4<uint32_t, 4> {
  uint32x4x4_t raw;
};
template <size_t N>
struct Tuple4<uint32_t, N> {
  uint32x2x4_t raw;
};
template <>
struct Tuple4<int32_t, 4> {
  int32x4x4_t raw;
};
template <size_t N>
struct Tuple4<int32_t, N> {
  int32x2x4_t raw;
};
template <>
struct Tuple4<uint64_t, 2> {
  uint64x2x4_t raw;
};
template <size_t N>
struct Tuple4<uint64_t, N> {
  uint64x1x4_t raw;
};
template <>
struct Tuple4<int64_t, 2> {
  int64x2x4_t raw;
};
template <size_t N>
struct Tuple4<int64_t, N> {
  int64x1x4_t raw;
};

template <>
struct Tuple4<float16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<float16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<bfloat16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<bfloat16_t, N> {
  uint16x4x4_t raw;
};

template <>
struct Tuple4<float32_t, 4> {
  float32x4x4_t raw;
};
template <size_t N>
struct Tuple4<float32_t, N> {
  float32x2x4_t raw;
};
#if HWY_ARCH_ARM_A64
template <>
struct Tuple4<float64_t, 2> {
  float64x2x4_t raw;
};
template <size_t N>
struct Tuple4<float64_t, N> {
  float64x1x4_t raw;
};
#endif  // HWY_ARCH_ARM_A64

template <typename T, size_t N>
struct Raw128;

// 128
template <>
struct Raw128<uint8_t, 16> {
  using type = uint8x16_t;
};

template <>
struct Raw128<uint16_t, 8> {
  using type = uint16x8_t;
};

template <>
struct Raw128<uint32_t, 4> {
  using type = uint32x4_t;
};

template <>
struct Raw128<uint64_t, 2> {
  using type = uint64x2_t;
};

template <>
struct Raw128<int8_t, 16> {
  using type = int8x16_t;
};

template <>
struct Raw128<int16_t, 8> {
  using type = int16x8_t;
};

template <>
struct Raw128<int32_t, 4> {
  using type = int32x4_t;
};

template <>
struct Raw128<int64_t, 2> {
  using type = int64x2_t;
};

template <>
struct Raw128<float16_t, 8> {
  using type = uint16x8_t;
};

template <>
struct Raw128<bfloat16_t, 8> {
  using type = uint16x8_t;
};

template <>
struct Raw128<float, 4> {
  using type = float32x4_t;
};

#if HWY_ARCH_ARM_A64
template <>
struct Raw128<double, 2> {
  using type = float64x2_t;
};
#endif

// 64
template <>
struct Raw128<uint8_t, 8> {
  using type = uint8x8_t;
};

template <>
struct Raw128<uint16_t, 4> {
  using type = uint16x4_t;
};

template <>
struct Raw128<uint32_t, 2> {
  using type = uint32x2_t;
};

template <>
struct Raw128<uint64_t, 1> {
  using type = uint64x1_t;
};

template <>
struct Raw128<int8_t, 8> {
  using type = int8x8_t;
};

template <>
struct Raw128<int16_t, 4> {
  using type = int16x4_t;
};

template <>
struct Raw128<int32_t, 2> {
  using type = int32x2_t;
};

template <>
struct Raw128<int64_t, 1> {
  using type = int64x1_t;
};

template <>
struct Raw128<float16_t, 4> {
  using type = uint16x4_t;
};

template <>
struct Raw128<bfloat16_t, 4> {
  using type = uint16x4_t;
};

template <>
struct Raw128<float, 2> {
  using type = float32x2_t;
};

#if HWY_ARCH_ARM_A64
template <>
struct Raw128<double, 1> {
  using type = float64x1_t;
};
#endif

// 32 (same as 64)
template <>
struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};

template <>
struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};

template <>
struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {};

template <>
struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};

template <>
struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {};

template <>
struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};

template <>
struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};

template <>
struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};

template <>
struct Raw128<float, 1> : public Raw128<float, 2> {};

// 16 (same as 64)
template <>
struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};

template <>
struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};

template <>
struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};

template <>
struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};

template <>
struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};

template <>
struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};

// 8 (same as 64)
template <>
struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};

template <>
struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T, N>::type;

 public:
  HWY_INLINE Vec128() {}
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;
  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

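// Usage sketch (added for exposition): arithmetic is expressed through the
// non-member operators defined later in this file, so compound assignment
// only compiles for types that have the matching operator:
//
//   Vec128<float> v = Set(Full128<float>(), 1.0f);
//   v += Set(Full128<float>(), 2.0f);  // ok: operator+ exists for f32
//   // v /= ... would be ill-formed for integer lanes; only f32/f64 divide.
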
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
  // ARM C Language Extensions return and expect unsigned type.
  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;

 public:
  HWY_INLINE Mask128() {}
  Mask128(const Mask128&) = default;
  Mask128& operator=(const Mask128&) = default;
  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}

  Raw raw;
};

template <typename T>
using Mask64 = Mask128<T, 8 / sizeof(T)>;

namespace detail {

// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
  template <typename T, size_t N>
  Simd<T, N, 0> operator()(Vec128<T, N>) const {
    return Simd<T, N, 0>();
  }
};

}  // namespace detail

template <class V>
using DFromV = decltype(detail::DeduceD()(V()));

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ------------------------------ BitCast

namespace detail {

// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
// vreinterpret*_u8_*() set of functions.
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type##_t)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
  return v;
}

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
                                 HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)

// Special cases for [b]float16_t, which have the same Raw as uint16_t.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8

template <size_t N>
HWY_INLINE Vec128<uint8_t, N> BitCastFromByte(Simd<uint8_t, N, 0> /* tag */,
                                              Vec128<uint8_t, N> v) {
  return v;
}

// 64-bit or less:

template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_INLINE Vec128<int8_t, N> BitCastFromByte(Simd<int8_t, N, 0> /* tag */,
                                             Vec128<uint8_t, N> v) {
  return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint16_t, N> BitCastFromByte(Simd<uint16_t, N, 0> /* tag */,
                                               Vec128<uint8_t, N * 2> v) {
  return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> BitCastFromByte(Simd<int16_t, N, 0> /* tag */,
                                              Vec128<uint8_t, N * 2> v) {
  return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> BitCastFromByte(Simd<uint32_t, N, 0> /* tag */,
                                               Vec128<uint8_t, N * 4> v) {
  return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> BitCastFromByte(Simd<int32_t, N, 0> /* tag */,
                                              Vec128<uint8_t, N * 4> v) {
  return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> BitCastFromByte(Simd<float, N, 0> /* tag */,
                                            Vec128<uint8_t, N * 4> v) {
  return Vec128<float, N>(vreinterpret_f32_u8(v.raw));
}
HWY_INLINE Vec64<uint64_t> BitCastFromByte(Full64<uint64_t> /* tag */,
                                           Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
}
HWY_INLINE Vec64<int64_t> BitCastFromByte(Full64<int64_t> /* tag */,
                                          Vec64<uint8_t> v) {
  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
}
#if HWY_ARCH_ARM_A64
HWY_INLINE Vec64<double> BitCastFromByte(Full64<double> /* tag */,
                                         Vec64<uint8_t> v) {
  return Vec64<double>(vreinterpret_f64_u8(v.raw));
}
#endif

// 128-bit full:

HWY_INLINE Vec128<int8_t> BitCastFromByte(Full128<int8_t> /* tag */,
                                          Vec128<uint8_t> v) {
  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
}
HWY_INLINE Vec128<uint16_t> BitCastFromByte(Full128<uint16_t> /* tag */,
                                            Vec128<uint8_t> v) {
  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
}
HWY_INLINE Vec128<int16_t> BitCastFromByte(Full128<int16_t> /* tag */,
                                           Vec128<uint8_t> v) {
  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
}
HWY_INLINE Vec128<uint32_t> BitCastFromByte(Full128<uint32_t> /* tag */,
                                            Vec128<uint8_t> v) {
  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
}
HWY_INLINE Vec128<int32_t> BitCastFromByte(Full128<int32_t> /* tag */,
                                           Vec128<uint8_t> v) {
  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
}
HWY_INLINE Vec128<float> BitCastFromByte(Full128<float> /* tag */,
                                         Vec128<uint8_t> v) {
  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
}
HWY_INLINE Vec128<uint64_t> BitCastFromByte(Full128<uint64_t> /* tag */,
                                            Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
}
HWY_INLINE Vec128<int64_t> BitCastFromByte(Full128<int64_t> /* tag */,
                                           Vec128<uint8_t> v) {
  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
}

#if HWY_ARCH_ARM_A64
HWY_INLINE Vec128<double> BitCastFromByte(Full128<double> /* tag */,
                                          Vec128<uint8_t> v) {
  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
}
#endif

// Special cases for [b]float16_t, which have the same Raw as uint16_t.
template <size_t N>
HWY_INLINE Vec128<float16_t, N> BitCastFromByte(Simd<float16_t, N, 0> /* tag */,
                                                Vec128<uint8_t, N * 2> v) {
  return Vec128<float16_t, N>(BitCastFromByte(Simd<uint16_t, N, 0>(), v).raw);
}
template <size_t N>
HWY_INLINE Vec128<bfloat16_t, N> BitCastFromByte(
    Simd<bfloat16_t, N, 0> /* tag */, Vec128<uint8_t, N * 2> v) {
  return Vec128<bfloat16_t, N>(BitCastFromByte(Simd<uint16_t, N, 0>(), v).raw);
}

}  // namespace detail

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
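
// Usage sketch (added for exposition): BitCast reinterprets the bits of a
// vector as another lane type of the same overall size, going through bytes:
//
//   const Full128<uint32_t> du32;
//   const Full128<float> df;
//   const Vec128<float> vf = Set(df, 1.0f);
//   const Vec128<uint32_t> bits = BitCast(du32, vf);  // each lane 0x3F800000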

// ------------------------------ Set

// Returns a vector with all lanes set to "t".
#define HWY_NEON_BUILD_TPL_HWY_SET1
#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
  Simd<type##_t, size, 0> /* tag */, const type##_t t
#define HWY_NEON_BUILD_ARG_HWY_SET1 t

HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1)

#undef HWY_NEON_BUILD_TPL_HWY_SET1
#undef HWY_NEON_BUILD_RET_HWY_SET1
#undef HWY_NEON_BUILD_PARAM_HWY_SET1
#undef HWY_NEON_BUILD_ARG_HWY_SET1

// Returns an all-zero vector.
template <typename T, size_t N>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> d) {
  return Set(d, 0);
}

template <size_t N>
HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
  return Vec128<bfloat16_t, N>(Zero(Simd<uint16_t, N, 0>()).raw);
}

template <class D>
using VFromD = decltype(Zero(D()));

// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
  typename detail::Raw128<T, N>::type a;
  return Vec128<T, N>(a);
  HWY_DIAGNOSTICS(pop)
}

// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
  HWY_ALIGN T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
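
// Usage sketch (added for exposition):
//
//   const Full128<int32_t> d;
//   const auto k1 = Set(d, 1);      // {1, 1, 1, 1}
//   const auto zero = Zero(d);      // {0, 0, 0, 0}
//   const auto ramp = Iota(d, 10);  // {10, 11, 12, 13}
//
// Note that Iota fills a full 16-byte buffer and relies on Load, which is
// defined later in this file.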

// ------------------------------ GetLane

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)

#undef HWY_NEON_BUILD_TPL_HWY_GET
#undef HWY_NEON_BUILD_RET_HWY_GET
#undef HWY_NEON_BUILD_PARAM_HWY_GET
#undef HWY_NEON_BUILD_ARG_HWY_GET

}  // namespace detail

template <class V>
HWY_API TFromV<V> GetLane(const V v) {
  return detail::GetLane<0>(v);
}

// ------------------------------ ExtractLane

// Requires one overload per vector length because GetLane<3> is a compile
// error if v is a uint32x2_t.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return detail::GetLane<0>(v);
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
      case 8:
        return detail::GetLane<8>(v);
      case 9:
        return detail::GetLane<9>(v);
      case 10:
        return detail::GetLane<10>(v);
      case 11:
        return detail::GetLane<11>(v);
      case 12:
        return detail::GetLane<12>(v);
      case 13:
        return detail::GetLane<13>(v);
      case 14:
        return detail::GetLane<14>(v);
      case 15:
        return detail::GetLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}
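
// Usage note (added for exposition): when the index is a compile-time
// constant, the __builtin_constant_p dispatch above lowers to a single lane
// read; otherwise the vector takes a round trip through memory. For example:
//
//   const Vec128<uint32_t> v = Iota(Full128<uint32_t>(), 0);
//   const uint32_t third = ExtractLane(v, 2);  // 2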

// ------------------------------ InsertLane

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
  Vec128<type##_t, size> v, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)

#undef HWY_NEON_BUILD_TPL_HWY_INSERT
#undef HWY_NEON_BUILD_RET_HWY_INSERT
#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
#undef HWY_NEON_BUILD_ARG_HWY_INSERT

}  // namespace detail

// Requires one overload per vector length because InsertLane<3> may be a
// compile error.

template <typename T>
HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  (void)v;
  return Set(DFromV<decltype(v)>(), t);
}

template <typename T>
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}
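
// Usage sketch (added for exposition): InsertLane returns a copy with one
// lane replaced; as with ExtractLane, a constant index avoids the memory
// round trip.
//
//   Vec128<float> v = Zero(Full128<float>());
//   v = InsertLane(v, 3, 2.5f);  // lane 3 is now 2.5f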

// ================================================== ARITHMETIC

// ------------------------------ Addition
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)

// ------------------------------ Subtraction
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)

// ------------------------------ SumsOf8

HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
}
HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
}
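
// Semantics sketch (added for exposition): each u64 output lane is the sum of
// the eight consecutive u8 input lanes it covers, computed via three pairwise
// widening adds (u8 -> u16 -> u32 -> u64). For example, an input with every
// byte equal to 1 yields 8 in every u64 lane.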

// ------------------------------ SaturatedAdd
// Only defined for uint8_t, uint16_t and their signed versions, as in other
// architectures.

// Returns a + b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INT_8(SaturatedAdd, vqadd, _, 2)
HWY_NEON_DEF_FUNCTION_INT_16(SaturatedAdd, vqadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedAdd, vqadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedAdd, vqadd, _, 2)

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2)

// Not part of API, used in implementation.
namespace detail {
HWY_NEON_DEF_FUNCTION_UINT_32(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_64(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_INT_32(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSub, vqsub, _, 2)
}  // namespace detail

// ------------------------------ Average

// Returns (a + b + 1) / 2
HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)

// ------------------------------ Neg

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)  // i64 implemented below

HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vneg_s64(v.raw));
#else
  return Zero(Full64<int64_t>()) - v;
#endif
}

HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vnegq_s64(v.raw));
#else
  return Zero(Full128<int64_t>()) - v;
#endif
}

// ------------------------------ ShiftLeft

// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)   \
  template <int kBits>                                                         \
  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) {        \
    return kBits == 0 ? v                                                      \
                      : Vec128<type##_t, size>(HWY_NEON_EVAL(                  \
                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
  }

HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)

HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")

// ------------------------------ RotateRight (ShiftRight, Or)

template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
}

template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
}
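
// Usage sketch (added for exposition): RotateRight composes shifts because
// NEON lacks a rotate instruction for these lane sizes. For example,
// RotateRight<8>(Set(Full128<uint32_t>(), 0x12345678u)) yields 0x78123456u in
// every lane. The HWY_MIN guards only silence invalid-shift warnings for the
// kBits == 0 case, which returns early.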

// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
// mechanism for checking for extensions to ARMv8.

// ------------------------------ Shl

HWY_API Vec128<uint8_t> operator<<(const Vec128<uint8_t> v,
                                   const Vec128<uint8_t> bits) {
  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
}
template <size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_API Vec128<uint8_t, N> operator<<(const Vec128<uint8_t, N> v,
                                      const Vec128<uint8_t, N> bits) {
  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
}

HWY_API Vec128<uint16_t> operator<<(const Vec128<uint16_t> v,
                                    const Vec128<uint16_t> bits) {
  return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
                                       const Vec128<uint16_t, N> bits) {
  return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
}

HWY_API Vec128<uint32_t> operator<<(const Vec128<uint32_t> v,
                                    const Vec128<uint32_t> bits) {
  return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
                                       const Vec128<uint32_t, N> bits) {
  return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
}

HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
                                    const Vec128<uint64_t> bits) {
  return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
}
HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
                                   const Vec64<uint64_t> bits) {
  return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
}

HWY_API Vec128<int8_t> operator<<(const Vec128<int8_t> v,
                                  const Vec128<int8_t> bits) {
  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
}
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_API Vec128<int8_t, N> operator<<(const Vec128<int8_t, N> v,
                                     const Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
}

HWY_API Vec128<int16_t> operator<<(const Vec128<int16_t> v,
                                   const Vec128<int16_t> bits) {
  return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> operator<<(const Vec128<int16_t, N> v,
                                      const Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
}

HWY_API Vec128<int32_t> operator<<(const Vec128<int32_t> v,
                                   const Vec128<int32_t> bits) {
  return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_API Vec128<int32_t, N> operator<<(const Vec128<int32_t, N> v,
                                      const Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
}

HWY_API Vec128<int64_t> operator<<(const Vec128<int64_t> v,
                                   const Vec128<int64_t> bits) {
  return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
}
HWY_API Vec64<int64_t> operator<<(const Vec64<int64_t> v,
                                  const Vec64<int64_t> bits) {
  return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
}

// ------------------------------ Shr (Neg)

HWY_API Vec128<uint8_t> operator>>(const Vec128<uint8_t> v,
                                   const Vec128<uint8_t> bits) {
  const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
}
template <size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_API Vec128<uint8_t, N> operator>>(const Vec128<uint8_t, N> v,
                                      const Vec128<uint8_t, N> bits) {
  const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N, 0>(), bits)).raw;
  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
}

HWY_API Vec128<uint16_t> operator>>(const Vec128<uint16_t> v,
                                    const Vec128<uint16_t> bits) {
  const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
  return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> v,
                                       const Vec128<uint16_t, N> bits) {
  const int16x4_t neg_bits = Neg(BitCast(Simd<int16_t, N, 0>(), bits)).raw;
  return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
}

HWY_API Vec128<uint32_t> operator>>(const Vec128<uint32_t> v,
                                    const Vec128<uint32_t> bits) {
  const int32x4_t neg_bits = Neg(BitCast(Full128<int32_t>(), bits)).raw;
  return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> v,
                                       const Vec128<uint32_t, N> bits) {
  const int32x2_t neg_bits = Neg(BitCast(Simd<int32_t, N, 0>(), bits)).raw;
  return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
}

HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
                                    const Vec128<uint64_t> bits) {
  const int64x2_t neg_bits = Neg(BitCast(Full128<int64_t>(), bits)).raw;
  return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
}
HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
                                   const Vec64<uint64_t> bits) {
  const int64x1_t neg_bits = Neg(BitCast(Full64<int64_t>(), bits)).raw;
  return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
}

HWY_API Vec128<int8_t> operator>>(const Vec128<int8_t> v,
                                  const Vec128<int8_t> bits) {
  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_API Vec128<int8_t, N> operator>>(const Vec128<int8_t, N> v,
                                     const Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int16_t> operator>>(const Vec128<int16_t> v,
                                   const Vec128<int16_t> bits) {
  return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
                                      const Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int32_t> operator>>(const Vec128<int32_t> v,
                                   const Vec128<int32_t> bits) {
  return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
                                      const Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int64_t> operator>>(const Vec128<int64_t> v,
                                   const Vec128<int64_t> bits) {
  return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
}
HWY_API Vec64<int64_t> operator>>(const Vec64<int64_t> v,
                                  const Vec64<int64_t> bits) {
  return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
}

// ------------------------------ ShiftLeftSame (Shl)

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
  return v << Set(Simd<T, N, 0>(), static_cast<T>(bits));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
  return v >> Set(Simd<T, N, 0>(), static_cast<T>(bits));
}
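
// Usage sketch (added for exposition): unlike ShiftLeft<kBits>, the shift
// amount here is a runtime value, broadcast to every lane and applied via the
// variable-shift operators above:
//
//   const Full128<uint32_t> d;
//   const auto v = Set(d, 0x80u);
//   const auto w = ShiftRightSame(v, 4);  // every lane 0x8u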

// ------------------------------ Integer multiplication

// Unsigned
HWY_API Vec128<uint16_t> operator*(const Vec128<uint16_t> a,
                                   const Vec128<uint16_t> b) {
  return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw));
}
HWY_API Vec128<uint32_t> operator*(const Vec128<uint32_t> a,
                                   const Vec128<uint32_t> b) {
  return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw));
}

template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw));
}

// Signed
HWY_API Vec128<int16_t> operator*(const Vec128<int16_t> a,
                                  const Vec128<int16_t> b) {
  return Vec128<int16_t>(vmulq_s16(a.raw, b.raw));
}
HWY_API Vec128<int32_t> operator*(const Vec128<int32_t> a,
                                  const Vec128<int32_t> b) {
  return Vec128<int32_t>(vmulq_s32(a.raw, b.raw));
}

template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw));
}

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
                                const Vec128<int16_t> b) {
  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
#if HWY_ARCH_ARM_A64
  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
#else
  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
#endif
  return Vec128<int16_t>(
      vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
}
HWY_API Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
                                 const Vec128<uint16_t> b) {
  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
#if HWY_ARCH_ARM_A64
  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
#else
  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
#endif
  return Vec128<uint16_t>(
      vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
}

template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
}
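
// Semantics note (added for exposition): MulHigh widens each 16-bit product
// to 32 bits (vmull) and then keeps only the upper halves via vuzp2, the
// odd-lane de-interleave. E.g. MulHigh of i16 lanes 0x4000 and 0x0004 yields
// 0x0001, since 0x4000 * 0x0004 = 0x00010000.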

HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
}

// ------------------------------ Floating-point mul / div

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)

// Approximate reciprocal
HWY_API Vec128<float> ApproximateReciprocal(const Vec128<float> v) {
  return Vec128<float>(vrecpeq_f32(v.raw));
}
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
  return Vec128<float, N>(vrecpe_f32(v.raw));
}

#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
#else
// Not defined on armv7: approximate
namespace detail {

HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
    const Vec128<float> recip, const Vec128<float> divisor) {
  return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
}
template <size_t N>
HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
    const Vec128<float, N> recip, Vec128<float, N> divisor) {
  return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  auto x = ApproximateReciprocal(b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  return a * x;
}
#endif
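
// Math note (added for exposition): vrecps(x, b) computes 2 - x*b, so each
// x *= ReciprocalNewtonRaphsonStep(x, b) above is one Newton-Raphson update
//   x' = x * (2 - x*b)
// for the root of f(x) = 1/x - b, roughly doubling the number of correct bits
// per step; three steps refine the coarse vrecpe estimate to near full f32
// precision.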

// ------------------------------ Absolute value of difference.

HWY_API Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
  return Vec128<float>(vabdq_f32(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  return Vec128<float, N>(vabd_f32(a.raw, b.raw));
}

// ------------------------------ Floating-point multiply-add variants

// Returns add + mul * x
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
                             const Vec128<float> add) {
  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
}
#else
// Emulate FMA for floats.
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return mul * x + add;
}
#endif

#if HWY_ARCH_ARM_A64
HWY_API Vec64<double> MulAdd(const Vec64<double> mul, const Vec64<double> x,
                             const Vec64<double> add) {
  return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
                              const Vec128<double> add) {
  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
}
#endif

// Returns add - mul * x
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
                                const Vec128<float> add) {
  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
}
#else
// Emulate FMA for floats.
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return add - mul * x;
}
#endif

#if HWY_ARCH_ARM_A64
HWY_API Vec64<double> NegMulAdd(const Vec64<double> mul, const Vec64<double> x,
                                const Vec64<double> add) {
  return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
                                 const Vec128<double> x,
                                 const Vec128<double> add) {
  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
}
#endif

// Returns mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> sub) {
  return MulAdd(mul, x, Neg(sub));
}

// Returns -mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> sub) {
  return Neg(MulAdd(mul, x, sub));
}

#if HWY_ARCH_ARM_A64
template <size_t N>
HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
                                 const Vec128<double, N> x,
                                 const Vec128<double, N> sub) {
  return MulAdd(mul, x, Neg(sub));
}
template <size_t N>
HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
                                    const Vec128<double, N> x,
                                    const Vec128<double, N> sub) {
  return Neg(MulAdd(mul, x, sub));
}
#endif

// ------------------------------ Floating-point square root (IfThenZeroElse)

// Approximate reciprocal square root
HWY_API Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
  return Vec128<float>(vrsqrteq_f32(v.raw));
}
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  return Vec128<float, N>(vrsqrte_f32(v.raw));
}

// Full precision square root
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
#else
namespace detail {

HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
                                            const Vec128<float> recip) {
  return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
}
template <size_t N>
HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(Vec128<float, N> root,
                                               Vec128<float, N> recip) {
  return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
}

}  // namespace detail

// Not defined on armv7: approximate
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  auto recip = ApproximateReciprocalSqrt(v);

  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
  recip *= detail::ReciprocalSqrtStep(v * recip, recip);

  const auto root = v * recip;
  return IfThenZeroElse(v == Zero(Simd<float, N, 0>()), root);
}
#endif
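
// Math note (added for exposition): vrsqrts(a, b) computes (3 - a*b) / 2, so
// each recip *= ReciprocalSqrtStep(v * recip, recip) above is the
// Newton-Raphson update
//   r' = r * (3 - v*r^2) / 2
// converging to 1/sqrt(v); multiplying by v then gives sqrt(v). The final
// IfThenZeroElse fixes up v == 0, where the reciprocal estimate is infinite.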

// ================================================== LOGICAL

// ------------------------------ Not

// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
template <typename T>
HWY_API Vec128<T> Not(const Vec128<T> v) {
  const Full128<T> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = decltype(Zero(d8));
  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
}

// ------------------------------ And
HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) & BitCast(du, b));
}

// ------------------------------ AndNot

namespace detail {
// reversed_andnot returns a & ~b.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
}  // namespace detail

// Returns ~not_mask & mask.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  return detail::reversed_andnot(mask, not_mask);
}

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  VFromD<decltype(du)> ret =
      detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
  return BitCast(d, ret);
}

// ------------------------------ Or

HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) | BitCast(du, b));
}

// ------------------------------ Xor

HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)

// Uses the u32/64 defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  const Simd<uint8_t, N, 0> d8;
  return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
}

// ARM lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u8(bytes));
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, Simd<T, N, 0>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u8(bytes));
}

template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, Simd<T, N, 0>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
}

template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, Simd<T, N, 0>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
}

}  // namespace detail

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}
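
// Semantics sketch (added for exposition): vcnt counts bits per byte; wider
// lanes then accumulate their bytes with pairwise widening adds. For a u64
// vector whose lanes are 0x0101010101010101, PopulationCount yields 8 in each
// lane (one set bit per byte, summed across the eight bytes).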
2099
2100// ================================================== SIGN
2101
2102// ------------------------------ Abs
2103
2104// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
2105HWY_API Vec128<int8_t> Abs(const Vec128<int8_t> v) {
2106 return Vec128<int8_t>(vabsq_s8(v.raw));
2107}
2108HWY_API Vec128<int16_t> Abs(const Vec128<int16_t> v) {
2109 return Vec128<int16_t>(vabsq_s16(v.raw));
2110}
2111HWY_API Vec128<int32_t> Abs(const Vec128<int32_t> v) {
2112 return Vec128<int32_t>(vabsq_s32(v.raw));
2113}
2114// i64 is implemented after BroadcastSignBit.
2115HWY_API Vec128<float> Abs(const Vec128<float> v) {
2116 return Vec128<float>(vabsq_f32(v.raw));
2117}
2118
2119template <size_t N, HWY_IF_LE64(int8_t, N)>
2120HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
2121 return Vec128<int8_t, N>(vabs_s8(v.raw));
2122}
2123template <size_t N, HWY_IF_LE64(int16_t, N)>
2124HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
2125 return Vec128<int16_t, N>(vabs_s16(v.raw));
2126}
2127template <size_t N, HWY_IF_LE64(int32_t, N)>
2128HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
2129 return Vec128<int32_t, N>(vabs_s32(v.raw));
2130}
2131template <size_t N, HWY_IF_LE64(float, N)>
2132HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
2133 return Vec128<float, N>(vabs_f32(v.raw));
2134}
2135
2136#if HWY_ARCH_ARM_A64
2137HWY_API Vec128<double> Abs(const Vec128<double> v) {
2138 return Vec128<double>(vabsq_f64(v.raw));
2139}
2140
2141HWY_API Vec64<double> Abs(const Vec64<double> v) {
2142 return Vec64<double>(vabs_f64(v.raw));
2143}
2144#endif
2145
2146// ------------------------------ CopySign
2147
2148template <typename T, size_t N>
2149HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
2150 const Vec128<T, N> sign) {
2151 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2152 const auto msb = SignBit(Simd<T, N, 0>());
2153 return Or(AndNot(msb, magn), And(msb, sign));
2154}
2155
2156template <typename T, size_t N>
2157HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
2158 const Vec128<T, N> sign) {
2159 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2160 return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
2161}
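// Usage sketch: CopySign keeps the magnitude of "magn" and the sign of
// "sign"; CopySignToAbs omits the AndNot when the first argument is already
// known to be non-negative:
//   const Full128<float> d;
//   const auto r = CopySign(Set(d, 2.0f), Set(d, -0.5f));  // -2.0f per lane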
2162
2163// ------------------------------ BroadcastSignBit
2164
2165template <typename T, size_t N, HWY_IF_SIGNED(T)>
2166HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
2167 return ShiftRight<sizeof(T) * 8 - 1>(v);
2168}
2169
2170// ================================================== MASK
2171
2172// ------------------------------ To/from vector
2173
2174// Mask and Vec have the same representation (true = FF..FF).
2175template <typename T, size_t N>
2176HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
2177 const Simd<MakeUnsigned<T>, N, 0> du;
2178 return Mask128<T, N>(BitCast(du, v).raw);
2179}
2180
2181template <typename T, size_t N>
2182HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> d, const Mask128<T, N> v) {
2183 return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
2184}
2185
2186// ------------------------------ RebindMask
2187
2188template <typename TFrom, typename TTo, size_t N>
2189HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> dto, Mask128<TFrom, N> m) {
2190 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
2191 return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N, 0>(), m)));
2192}
2193
2194// ------------------------------ IfThenElse(mask, yes, no) = mask ? yes : no.
2195
2196#define HWY_NEON_BUILD_TPL_HWY_IF
2197#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2198#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2199 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2200 const Vec128<type##_t, size> no
2201#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
2202
2203HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
2204
2205#undef HWY_NEON_BUILD_TPL_HWY_IF
2206#undef HWY_NEON_BUILD_RET_HWY_IF
2207#undef HWY_NEON_BUILD_PARAM_HWY_IF
2208#undef HWY_NEON_BUILD_ARG_HWY_IF
2209
2210// mask ? yes : 0
2211template <typename T, size_t N>
2212HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
2213 const Vec128<T, N> yes) {
2214 return yes & VecFromMask(Simd<T, N, 0>(), mask);
2215}
2216
2217// mask ? 0 : no
2218template <typename T, size_t N>
2219HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
2220 const Vec128<T, N> no) {
2221 return AndNot(VecFromMask(Simd<T, N, 0>(), mask), no);
2222}
2223
2224template <typename T, size_t N>
2225HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2226 Vec128<T, N> no) {
2227 static_assert(IsSigned<T>(), "Only works for signed/float");
2228 const Simd<T, N, 0> d;
2229 const RebindToSigned<decltype(d)> di;
2230
2231 Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2232 return IfThenElse(m, yes, no);
2233}
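// Usage sketch: selecting on the sign bit without a separate comparison:
//   const Full128<int32_t> d;
//   const auto r = IfNegativeThenElse(v, Set(d, -1), Set(d, 1));
//   // r is -1 in lanes where v < 0, else 1.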
2234
2235template <typename T, size_t N>
2236HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2237 const auto zero = Zero(Simd<T, N, 0>());
2238 return Max(zero, v);
2239}
2240
2241// ------------------------------ Mask logical
2242
2243template <typename T, size_t N>
2244HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
2245 return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
2246}
2247
2248template <typename T, size_t N>
2249HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
2250 const Simd<T, N, 0> d;
2251 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
2252}
2253
2254template <typename T, size_t N>
2255HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
2256 const Simd<T, N, 0> d;
2257 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
2258}
2259
2260template <typename T, size_t N>
2261HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
2262 const Simd<T, N, 0> d;
2263 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
2264}
2265
2266template <typename T, size_t N>
2267HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
2268 const Simd<T, N, 0> d;
2269 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
2270}
2271
2272// ================================================== COMPARE
2273
2274// Comparisons fill a lane with 1-bits if the condition is true, else 0.
2275
2276// ------------------------------ Shuffle2301 (for i64 compares)
2277
2278// Swap 32-bit halves in 64-bits
2279HWY_API Vec64<uint32_t> Shuffle2301(const Vec64<uint32_t> v) {
2280 return Vec64<uint32_t>(vrev64_u32(v.raw));
2281}
2282HWY_API Vec64<int32_t> Shuffle2301(const Vec64<int32_t> v) {
2283 return Vec64<int32_t>(vrev64_s32(v.raw));
2284}
2285HWY_API Vec64<float> Shuffle2301(const Vec64<float> v) {
2286 return Vec64<float>(vrev64_f32(v.raw));
2287}
2288HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
2289 return Vec128<uint32_t>(vrev64q_u32(v.raw));
2290}
2291HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
2292 return Vec128<int32_t>(vrev64q_s32(v.raw));
2293}
2294HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
2295 return Vec128<float>(vrev64q_f32(v.raw));
2296}
2297
2298#define HWY_NEON_BUILD_TPL_HWY_COMPARE
2299#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
2300#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
2301 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
2302#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
2303
2304// ------------------------------ Equality
2305HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
2306#if HWY_ARCH_ARM_A64
2307HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
2308#else
2309// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
2310HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
2311HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
2312#endif
2313
2314// ------------------------------ Strict inequality (signed, float)
2315#if HWY_ARCH_ARM_A64
2316HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
2317#else
2318HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
2319HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
2320#endif
2321HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
2322
2323// ------------------------------ Weak inequality (float)
2324HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
2325
2326#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
2327#undef HWY_NEON_BUILD_RET_HWY_COMPARE
2328#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
2329#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
2330
2331// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
2332
2333#if HWY_ARCH_ARM_V7
2334
2335template <size_t N>
2336HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
2337 const Vec128<int64_t, N> b) {
2338 const Simd<int32_t, N * 2, 0> d32;
2339 const Simd<int64_t, N, 0> d64;
2340 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2341 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2342 return MaskFromVec(BitCast(d64, cmp64));
2343}
2344
2345template <size_t N>
2346HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
2347 const Vec128<uint64_t, N> b) {
2348 const Simd<uint32_t, N * 2, 0> d32;
2349 const Simd<uint64_t, N, 0> d64;
2350 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2351 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2352 return MaskFromVec(BitCast(d64, cmp64));
2353}
2354
2355HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
2356 const Vec128<int64_t> b) {
2357 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
2358 return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
2359}
2360HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
2361 const Vec64<int64_t> b) {
2362 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
2363 return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
2364}
2365
2366template <size_t N>
2367HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
2368 const Vec128<uint64_t, N> b) {
2369 const DFromV<decltype(a)> du;
2370 const RebindToSigned<decltype(du)> di;
2371 const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
2372 return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
2373}
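// The msb expression above is the borrow out of a - b: when the sign bits
// differ, a < b exactly if b's sign bit is set (AndNot(a, b)); when they
// agree, the sign bit of a - b is the borrow (AndNot(a ^ b, a - b)). Scalar
// sketch of the same identity for one uint64_t lane:
//   const uint64_t msb = (~a & b) | (~(a ^ b) & (a - b));
//   const bool lt = (msb >> 63) != 0;  // equals a < b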
2374
2375#endif
2376
2377// ------------------------------ operator!= (operator==)
2378
2379// Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
2380#pragma push_macro("HWY_NEON_DEF_FUNCTION")
2381#undef HWY_NEON_DEF_FUNCTION
2382// This cannot have _any_ template argument (in x86_128 we can at least have N
2383// as an argument), otherwise it is not more specialized than rewritten
2384// operator== in C++20, leading to compile errors.
2385#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2386 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
2387 Vec128<type##_t, size> b) { \
2388 return Not(a == b); \
2389 }
2390
2391HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)
2392
2393#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2394
2395// ------------------------------ Reversed comparisons
2396
2397template <typename T, size_t N>
2398HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
2399 return operator<(b, a);
2400}
2401template <typename T, size_t N>
2402HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
2403 return operator<=(b, a);
2404}
2405
2406// ------------------------------ FirstN (Iota, Lt)
2407
2408template <typename T, size_t N>
2409HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
2410 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
2411 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
2412}
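// Usage sketch: FirstN is typically used to guard remainder iterations:
//   const Full128<int32_t> d;
//   const auto m = FirstN(d, 3);  // lanes 0..2 true, lane 3 false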
2413
2414// ------------------------------ TestBit (Eq)
2415
2416#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
2417#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
2418#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
2419 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
2420#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
2421
2422#if HWY_ARCH_ARM_A64
2423HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
2424#else
2425// No 64-bit versions on armv7
2426HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
2427HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
2428
2429template <size_t N>
2430HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
2431 Vec128<uint64_t, N> bit) {
2432 return (v & bit) == bit;
2433}
2434template <size_t N>
2435HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
2436 Vec128<int64_t, N> bit) {
2437 return (v & bit) == bit;
2438}
2439
2440#endif
2441#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
2442#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
2443#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
2444#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
2445
2446// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
2447HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
2448#if HWY_ARCH_ARM_A64
2449 return Vec128<int64_t>(vabsq_s64(v.raw));
2450#else
2451 const auto zero = Zero(Full128<int64_t>());
2452 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2453#endif
2454}
2455HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
2456#if HWY_ARCH_ARM_A64
2457 return Vec64<int64_t>(vabs_s64(v.raw));
2458#else
2459 const auto zero = Zero(Full64<int64_t>());
2460 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2461#endif
2462}
2463
2464// ------------------------------ Min (IfThenElse, BroadcastSignBit)
2465
2466// Unsigned
2467HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
2468
2469template <size_t N>
2470HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
2471 const Vec128<uint64_t, N> b) {
2472#if HWY_ARCH_ARM_A64
2473 return IfThenElse(b < a, b, a);
2474#else
2475 const DFromV<decltype(a)> du;
2476 const RebindToSigned<decltype(du)> di;
2477 return BitCast(du, BitCast(di, a) - BitCast(di, detail::SaturatedSub(a, b)));
2478#endif
2479}
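// The ARMv7 path works because unsigned saturation gives
// SaturatedSub(a, b) = max(a - b, 0), hence a - max(a - b, 0) = min(a, b),
// avoiding the missing 64-bit compare. Scalar sketch for one lane:
//   const uint64_t sat = (a > b) ? (a - b) : 0;  // vqsub_u64
//   const uint64_t min = a - sat;                // == (a < b) ? a : b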
2480
2481// Signed
2482HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)
2483
2484template <size_t N>
2485HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
2486 const Vec128<int64_t, N> b) {
2487#if HWY_ARCH_ARM_A64
2488 return IfThenElse(b < a, b, a);
2489#else
2490 const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
2491 return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
2492#endif
2493}
2494
2495// Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
2496#if HWY_ARCH_ARM_A64
2497HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
2498#else
2499HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
2500#endif
2501
2502// ------------------------------ Max (IfThenElse, BroadcastSignBit)
2503
2504// Unsigned (no u64)
2505HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)
2506
2507template <size_t N>
2508HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
2509 const Vec128<uint64_t, N> b) {
2510#if HWY_ARCH_ARM_A64
2511 return IfThenElse(b < a, a, b);
2512#else
2513 const DFromV<decltype(a)> du;
2514 const RebindToSigned<decltype(du)> di;
2515 return BitCast(du, BitCast(di, b) + BitCast(di, detail::SaturatedSub(a, b)));
2516#endif
2517}
2518
2519// Signed (no i64)
2520HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)
2521
2522template <size_t N>
2523HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
2524 const Vec128<int64_t, N> b) {
2525#if HWY_ARCH_ARM_A64
2526 return IfThenElse(b < a, a, b);
2527#else
2528 const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
2529 return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
2530#endif
2531}
2532
2533// Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
2534#if HWY_ARCH_ARM_A64
2535HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
2536#else
2537HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
2538#endif
2539
2540// ================================================== MEMORY
2541
2542// ------------------------------ Load 128
2543
2544HWY_API Vec128<uint8_t> LoadU(Full128<uint8_t> /* tag */,
2545 const uint8_t* HWY_RESTRICT unaligned) {
2546 return Vec128<uint8_t>(vld1q_u8(unaligned));
2547}
2548HWY_API Vec128<uint16_t> LoadU(Full128<uint16_t> /* tag */,
2549 const uint16_t* HWY_RESTRICT unaligned) {
2550 return Vec128<uint16_t>(vld1q_u16(unaligned));
2551}
2552HWY_API Vec128<uint32_t> LoadU(Full128<uint32_t> /* tag */,
2553 const uint32_t* HWY_RESTRICT unaligned) {
2554 return Vec128<uint32_t>(vld1q_u32(unaligned));
2555}
2556HWY_API Vec128<uint64_t> LoadU(Full128<uint64_t> /* tag */,
2557 const uint64_t* HWY_RESTRICT unaligned) {
2558 return Vec128<uint64_t>(vld1q_u64(unaligned));
2559}
2560HWY_API Vec128<int8_t> LoadU(Full128<int8_t> /* tag */,
2561 const int8_t* HWY_RESTRICT unaligned) {
2562 return Vec128<int8_t>(vld1q_s8(unaligned));
2563}
2564HWY_API Vec128<int16_t> LoadU(Full128<int16_t> /* tag */,
2565 const int16_t* HWY_RESTRICT unaligned) {
2566 return Vec128<int16_t>(vld1q_s16(unaligned));
2567}
2568HWY_API Vec128<int32_t> LoadU(Full128<int32_t> /* tag */,
2569 const int32_t* HWY_RESTRICT unaligned) {
2570 return Vec128<int32_t>(vld1q_s32(unaligned));
2571}
2572HWY_API Vec128<int64_t> LoadU(Full128<int64_t> /* tag */,
2573 const int64_t* HWY_RESTRICT unaligned) {
2574 return Vec128<int64_t>(vld1q_s64(unaligned));
2575}
2576HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
2577 const float* HWY_RESTRICT unaligned) {
2578 return Vec128<float>(vld1q_f32(unaligned));
2579}
2580#if HWY_ARCH_ARM_A64
2581HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
2582 const double* HWY_RESTRICT unaligned) {
2583 return Vec128<double>(vld1q_f64(unaligned));
2584}
2585#endif
2586
2587// ------------------------------ Load 64
2588
2589HWY_API Vec64<uint8_t> LoadU(Full64<uint8_t> /* tag */,
2590 const uint8_t* HWY_RESTRICT p) {
2591 return Vec64<uint8_t>(vld1_u8(p));
2592}
2593HWY_API Vec64<uint16_t> LoadU(Full64<uint16_t> /* tag */,
2594 const uint16_t* HWY_RESTRICT p) {
2595 return Vec64<uint16_t>(vld1_u16(p));
2596}
2597HWY_API Vec64<uint32_t> LoadU(Full64<uint32_t> /* tag */,
2598 const uint32_t* HWY_RESTRICT p) {
2599 return Vec64<uint32_t>(vld1_u32(p));
2600}
2601HWY_API Vec64<uint64_t> LoadU(Full64<uint64_t> /* tag */,
2602 const uint64_t* HWY_RESTRICT p) {
2603 return Vec64<uint64_t>(vld1_u64(p));
2604}
2605HWY_API Vec64<int8_t> LoadU(Full64<int8_t> /* tag */,
2606 const int8_t* HWY_RESTRICT p) {
2607 return Vec64<int8_t>(vld1_s8(p));
2608}
2609HWY_API Vec64<int16_t> LoadU(Full64<int16_t> /* tag */,
2610 const int16_t* HWY_RESTRICT p) {
2611 return Vec64<int16_t>(vld1_s16(p));
2612}
2613HWY_API Vec64<int32_t> LoadU(Full64<int32_t> /* tag */,
2614 const int32_t* HWY_RESTRICT p) {
2615 return Vec64<int32_t>(vld1_s32(p));
2616}
2617HWY_API Vec64<int64_t> LoadU(Full64<int64_t> /* tag */,
2618 const int64_t* HWY_RESTRICT p) {
2619 return Vec64<int64_t>(vld1_s64(p));
2620}
2621HWY_API Vec64<float> LoadU(Full64<float> /* tag */,
2622 const float* HWY_RESTRICT p) {
2623 return Vec64<float>(vld1_f32(p));
2624}
2625#if HWY_ARCH_ARM_A64
2626HWY_API Vec64<double> LoadU(Full64<double> /* tag */,
2627 const double* HWY_RESTRICT p) {
2628 return Vec64<double>(vld1_f64(p));
2629}
2630#endif
2631// ------------------------------ Load 32
2632
2633// Actual 32-bit broadcast load - used to implement the other lane types
2634// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
2635HWY_API Vec32<uint32_t> LoadU(Full32<uint32_t> /* tag */,
2636 const uint32_t* HWY_RESTRICT p) {
2637 return Vec32<uint32_t>(vld1_dup_u32(p));
2638}
2639HWY_API Vec32<int32_t> LoadU(Full32<int32_t> /* tag */,
2640 const int32_t* HWY_RESTRICT p) {
2641 return Vec32<int32_t>(vld1_dup_s32(p));
2642}
2643HWY_API Vec32<float> LoadU(Full32<float> /* tag */, const float* HWY_RESTRICT p) {
2644 return Vec32<float>(vld1_dup_f32(p));
2645}
2646
2647template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2648HWY_API Vec32<T> LoadU(Full32<T> d, const T* HWY_RESTRICT p) {
2649 const Repartition<uint32_t, decltype(d)> d32;
2650 uint32_t buf;
2651 CopyBytes<4>(p, &buf);
2652 return BitCast(d, LoadU(d32, &buf));
2653}
2654
2655// ------------------------------ Load 16
2656
2657// Actual 16-bit broadcast load - used to implement the other lane types
2658// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
2659HWY_API Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1, 0> /* tag */,
2660 const uint16_t* HWY_RESTRICT p) {
2661 return Vec128<uint16_t, 1>(vld1_dup_u16(p));
2662}
2663HWY_API Vec128<int16_t, 1> LoadU(Simd<int16_t, 1, 0> /* tag */,
2664 const int16_t* HWY_RESTRICT p) {
2665 return Vec128<int16_t, 1>(vld1_dup_s16(p));
2666}
2667
2668template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2669HWY_API Vec128<T, 2> LoadU(Simd<T, 2, 0> d, const T* HWY_RESTRICT p) {
2670 const Repartition<uint16_t, decltype(d)> d16;
2671 uint16_t buf;
2672 CopyBytes<2>(p, &buf);
2673 return BitCast(d, LoadU(d16, &buf));
2674}
2675
2676// ------------------------------ Load 8
2677
2678HWY_API Vec128<uint8_t, 1> LoadU(Simd<uint8_t, 1, 0> /* tag */,
2679 const uint8_t* HWY_RESTRICT p) {
2680 return Vec128<uint8_t, 1>(vld1_dup_u8(p));
2681}
2682
2683HWY_API Vec128<int8_t, 1> LoadU(Simd<int8_t, 1, 0> /* tag */,
2684 const int8_t* HWY_RESTRICT p) {
2685 return Vec128<int8_t, 1>(vld1_dup_s8(p));
2686}
2687
2688// [b]float16_t use the same Raw as uint16_t, so forward to that.
2689template <size_t N>
2690HWY_API Vec128<float16_t, N> LoadU(Simd<float16_t, N, 0> d,
2691 const float16_t* HWY_RESTRICT p) {
2692 const RebindToUnsigned<decltype(d)> du16;
2693 const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2694 return Vec128<float16_t, N>(LoadU(du16, pu16).raw);
2695}
2696template <size_t N>
2697HWY_API Vec128<bfloat16_t, N> LoadU(Simd<bfloat16_t, N, 0> d,
2698 const bfloat16_t* HWY_RESTRICT p) {
2699 const RebindToUnsigned<decltype(d)> du16;
2700 const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2701 return Vec128<bfloat16_t, N>(LoadU(du16, pu16).raw);
2702}
2703
2704// On ARM, Load is the same as LoadU.
2705template <typename T, size_t N>
2706HWY_API Vec128<T, N> Load(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
2707 return LoadU(d, p);
2708}
2709
2710template <typename T, size_t N>
2711HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2712 const T* HWY_RESTRICT aligned) {
2713 return IfThenElseZero(m, Load(d, aligned));
2714}
2715
2716// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
2717template <typename T, size_t N, HWY_IF_LE128(T, N)>
2718HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
2719 const T* const HWY_RESTRICT p) {
2720 return LoadU(d, p);
2721}
2722
2723// ------------------------------ Store 128
2724
2725HWY_API void StoreU(const Vec128<uint8_t> v, Full128<uint8_t> /* tag */,
2726 uint8_t* HWY_RESTRICT unaligned) {
2727 vst1q_u8(unaligned, v.raw);
2728}
2729HWY_API void StoreU(const Vec128<uint16_t> v, Full128<uint16_t> /* tag */,
2730 uint16_t* HWY_RESTRICT unaligned) {
2731 vst1q_u16(unaligned, v.raw);
2732}
2733HWY_API void StoreU(const Vec128<uint32_t> v, Full128<uint32_t> /* tag */,
2734 uint32_t* HWY_RESTRICT unaligned) {
2735 vst1q_u32(unaligned, v.raw);
2736}
2737HWY_API void StoreU(const Vec128<uint64_t> v, Full128<uint64_t> /* tag */,
2738 uint64_t* HWY_RESTRICT unaligned) {
2739 vst1q_u64(unaligned, v.raw);
2740}
2741HWY_API void StoreU(const Vec128<int8_t> v, Full128<int8_t> /* tag */,
2742 int8_t* HWY_RESTRICT unaligned) {
2743 vst1q_s8(unaligned, v.raw);
2744}
2745HWY_API void StoreU(const Vec128<int16_t> v, Full128<int16_t> /* tag */,
2746 int16_t* HWY_RESTRICT unaligned) {
2747 vst1q_s16(unaligned, v.raw);
2748}
2749HWY_API void StoreU(const Vec128<int32_t> v, Full128<int32_t> /* tag */,
2750 int32_t* HWY_RESTRICT unaligned) {
2751 vst1q_s32(unaligned, v.raw);
2752}
2753HWY_API void StoreU(const Vec128<int64_t> v, Full128<int64_t> /* tag */,
2754 int64_t* HWY_RESTRICT unaligned) {
2755 vst1q_s64(unaligned, v.raw);
2756}
2757HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
2758 float* HWY_RESTRICT unaligned) {
2759 vst1q_f32(unaligned, v.raw);
2760}
2761#if HWY_ARCH_ARM_A64
2762HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2763 double* HWY_RESTRICT unaligned) {
2764 vst1q_f64(unaligned, v.raw);
2765}
2766#endif
2767
2768// ------------------------------ Store 64
2769
2770HWY_API void StoreU(const Vec64<uint8_t> v, Full64<uint8_t> /* tag */,
2771 uint8_t* HWY_RESTRICT p) {
2772 vst1_u8(p, v.raw);
2773}
2774HWY_API void StoreU(const Vec64<uint16_t> v, Full64<uint16_t> /* tag */,
2775 uint16_t* HWY_RESTRICT p) {
2776 vst1_u16(p, v.raw);
2777}
2778HWY_API void StoreU(const Vec64<uint32_t> v, Full64<uint32_t> /* tag */,
2779 uint32_t* HWY_RESTRICT p) {
2780 vst1_u32(p, v.raw);
2781}
2782HWY_API void StoreU(const Vec64<uint64_t> v, Full64<uint64_t> /* tag */,
2783 uint64_t* HWY_RESTRICT p) {
2784 vst1_u64(p, v.raw);
2785}
2786HWY_API void StoreU(const Vec64<int8_t> v, Full64<int8_t> /* tag */,
2787 int8_t* HWY_RESTRICT p) {
2788 vst1_s8(p, v.raw);
2789}
2790HWY_API void StoreU(const Vec64<int16_t> v, Full64<int16_t> /* tag */,
2791 int16_t* HWY_RESTRICT p) {
2792 vst1_s16(p, v.raw);
2793}
2794HWY_API void StoreU(const Vec64<int32_t> v, Full64<int32_t> /* tag */,
2795 int32_t* HWY_RESTRICT p) {
2796 vst1_s32(p, v.raw);
2797}
2798HWY_API void StoreU(const Vec64<int64_t> v, Full64<int64_t> /* tag */,
2799 int64_t* HWY_RESTRICT p) {
2800 vst1_s64(p, v.raw);
2801}
2802HWY_API void StoreU(const Vec64<float> v, Full64<float> /* tag */,
2803 float* HWY_RESTRICT p) {
2804 vst1_f32(p, v.raw);
2805}
2806#if HWY_ARCH_ARM_A64
2807HWY_API void StoreU(const Vec64<double> v, Full64<double> /* tag */,
2808 double* HWY_RESTRICT p) {
2809 vst1_f64(p, v.raw);
2810}
2811#endif
2812
2813// ------------------------------ Store 32
2814
2815HWY_API void StoreU(const Vec32<uint32_t> v, Full32<uint32_t> /* tag */,
2816 uint32_t* HWY_RESTRICT p) {
2817 vst1_lane_u32(p, v.raw, 0);
2818}
2819HWY_API void StoreU(const Vec32<int32_t> v, Full32<int32_t> /* tag */,
2820 int32_t* HWY_RESTRICT p) {
2821 vst1_lane_s32(p, v.raw, 0);
2822}
2823HWY_API void StoreU(const Vec32<float> v, Full32<float> /* tag */,
2824 float* HWY_RESTRICT p) {
2825 vst1_lane_f32(p, v.raw, 0);
2826}
2827
2828template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2829HWY_API void StoreU(const Vec32<T> v, Full32<T> d, T* HWY_RESTRICT p) {
2830 const Repartition<uint32_t, decltype(d)> d32;
2831 const uint32_t buf = GetLane(BitCast(d32, v));
2832 CopyBytes<4>(&buf, p);
2833}
2834
2835// ------------------------------ Store 16
2836
2837HWY_API void StoreU(Vec128<uint16_t, 1> v, Simd<uint16_t, 1, 0> /* tag */,
2838 uint16_t* HWY_RESTRICT p) {
2839 vst1_lane_u16(p, v.raw, 0);
2840}
2841HWY_API void StoreU(Vec128<int16_t, 1> v, Simd<int16_t, 1, 0> /* tag */,
2842 int16_t* HWY_RESTRICT p) {
2843 vst1_lane_s16(p, v.raw, 0);
2844}
2845
2846template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2847HWY_API void StoreU(Vec128<T, 2> v, Simd<T, 2, 0> d, T* HWY_RESTRICT p) {
2848 const Repartition<uint16_t, decltype(d)> d16;
2849 const uint16_t buf = GetLane(BitCast(d16, v));
2850 CopyBytes<2>(&buf, p);
2851}
2852
2853// ------------------------------ Store 8
2854
2855HWY_API void StoreU(Vec128<uint8_t, 1> v, Simd<uint8_t, 1, 0> /* tag */,
2856 uint8_t* HWY_RESTRICT p) {
2857 vst1_lane_u8(p, v.raw, 0);
2858}
2859HWY_API void StoreU(Vec128<int8_t, 1> v, Simd<int8_t, 1, 0> /* tag */,
2860 int8_t* HWY_RESTRICT p) {
2861 vst1_lane_s8(p, v.raw, 0);
2862}
2863
2864// [b]float16_t use the same Raw as uint16_t, so forward to that.
2865template <size_t N>
2866HWY_API void StoreU(Vec128<float16_t, N> v, Simd<float16_t, N, 0> d,
2867 float16_t* HWY_RESTRICT p) {
2868 const RebindToUnsigned<decltype(d)> du16;
2869 const auto pu16 = reinterpret_cast<uint16_t*>(p);
2870 return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2871}
2872template <size_t N>
2873HWY_API void StoreU(Vec128<bfloat16_t, N> v, Simd<bfloat16_t, N, 0> d,
2874 bfloat16_t* HWY_RESTRICT p) {
2875 const RebindToUnsigned<decltype(d)> du16;
2876 const auto pu16 = reinterpret_cast<uint16_t*>(p);
2877 return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2878}
2879
2880// On ARM, Store is the same as StoreU.
2881template <typename T, size_t N>
2882HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
2883 StoreU(v, d, aligned);
2884}
2885
2886template <typename T, size_t N>
2887HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2888 T* HWY_RESTRICT p) {
2889 // Treat as unsigned so that we correctly support float16.
2890 const RebindToUnsigned<decltype(d)> du;
2891 const auto blended =
2892 IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
2893 StoreU(BitCast(d, blended), d, p);
2894}
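// Note: this BlendedStore is emulated as load + blend + store, so all bytes
// at p must be readable and writable even for false lanes, and the store is
// not atomic. Usage sketch:
//   BlendedStore(v, FirstN(d, count), d, p);  // first "count" lanes updated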
2895
2896// ------------------------------ Non-temporal stores
2897
2898// Same as aligned stores on non-x86.
2899
2900template <typename T, size_t N>
2901HWY_API void Stream(const Vec128<T, N> v, Simd<T, N, 0> d,
2902 T* HWY_RESTRICT aligned) {
2903 Store(v, d, aligned);
2904}
2905
2906// ================================================== CONVERT
2907
2908// ------------------------------ Promotions (part w/ narrow lanes -> full)
2909
2910// Unsigned: zero-extend to full vector.
2911HWY_API Vec128<uint16_t> PromoteTo(Full128<uint16_t> /* tag */,
2912 const Vec64<uint8_t> v) {
2913 return Vec128<uint16_t>(vmovl_u8(v.raw));
2914}
2915HWY_API Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
2916 const Vec32<uint8_t> v) {
2917 uint16x8_t a = vmovl_u8(v.raw);
2918 return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
2919}
2920HWY_API Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
2921 const Vec64<uint16_t> v) {
2922 return Vec128<uint32_t>(vmovl_u16(v.raw));
2923}
2924HWY_API Vec128<uint64_t> PromoteTo(Full128<uint64_t> /* tag */,
2925 const Vec64<uint32_t> v) {
2926 return Vec128<uint64_t>(vmovl_u32(v.raw));
2927}
2928HWY_API Vec128<int16_t> PromoteTo(Full128<int16_t> d, const Vec64<uint8_t> v) {
2929 return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
2930}
2931HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> d, const Vec32<uint8_t> v) {
2932 uint16x8_t a = vmovl_u8(v.raw);
2933 return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
2934}
2935HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> d, const Vec64<uint16_t> v) {
2936 return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
2937}
2938
2939// Unsigned: zero-extend to half vector.
2940template <size_t N, HWY_IF_LE64(uint16_t, N)>
2941HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
2942 const Vec128<uint8_t, N> v) {
2943 return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw)));
2944}
2945template <size_t N, HWY_IF_LE64(uint32_t, N)>
2946HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
2947 const Vec128<uint8_t, N> v) {
2948 uint16x8_t a = vmovl_u8(v.raw);
2949 return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a))));
2950}
2951template <size_t N>
2952HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
2953 const Vec128<uint16_t, N> v) {
2954 return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw)));
2955}
2956template <size_t N, HWY_IF_LE64(uint64_t, N)>
2957HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
2958 const Vec128<uint32_t, N> v) {
2959 return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
2960}
2961template <size_t N, HWY_IF_LE64(int16_t, N)>
2962HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> d,
2963 const Vec128<uint8_t, N> v) {
2964 return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
2965}
2966template <size_t N, HWY_IF_LE64(int32_t, N)>
2967HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
2968 const Vec128<uint8_t, N> v) {
2969 uint16x8_t a = vmovl_u8(v.raw);
2970 uint32x4_t b = vmovl_u16(vget_low_u16(a));
2971 return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b)));
2972}
2973template <size_t N, HWY_IF_LE64(int32_t, N)>
2974HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
2975 const Vec128<uint16_t, N> v) {
2976 uint32x4_t a = vmovl_u16(v.raw);
2977 return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a)));
2978}
2979
2980// Signed: replicate sign bit to full vector.
2981HWY_API Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
2982 const Vec64<int8_t> v) {
2983 return Vec128<int16_t>(vmovl_s8(v.raw));
2984}
2985HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
2986 const Vec32<int8_t> v) {
2987 int16x8_t a = vmovl_s8(v.raw);
2988 return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
2989}
2990HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
2991 const Vec64<int16_t> v) {
2992 return Vec128<int32_t>(vmovl_s16(v.raw));
2993}
2994HWY_API Vec128<int64_t> PromoteTo(Full128<int64_t> /* tag */,
2995 const Vec64<int32_t> v) {
2996 return Vec128<int64_t>(vmovl_s32(v.raw));
2997}
2998
2999// Signed: replicate sign bit to half vector.
3000template <size_t N>
3001HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
3002 const Vec128<int8_t, N> v) {
3003 return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw)));
3004}
3005template <size_t N>
3006HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3007 const Vec128<int8_t, N> v) {
3008 int16x8_t a = vmovl_s8(v.raw);
3009 int32x4_t b = vmovl_s16(vget_low_s16(a));
3010 return Vec128<int32_t, N>(vget_low_s32(b));
3011}
3012template <size_t N>
3013HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3014 const Vec128<int16_t, N> v) {
3015 return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw)));
3016}
3017template <size_t N>
3018HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
3019 const Vec128<int32_t, N> v) {
3020 return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw)));
3021}
3022
3023#if __ARM_FP & 2
3024
3025HWY_API Vec128<float> PromoteTo(Full128<float> /* tag */,
3026 const Vec128<float16_t, 4> v) {
3027 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3028 return Vec128<float>(f32);
3029}
3030template <size_t N>
3031HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
3032 const Vec128<float16_t, N> v) {
3033 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3034 return Vec128<float, N>(vget_low_f32(f32));
3035}
3036
3037#else
3038
3039template <size_t N>
3040HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
3041 const Vec128<float16_t, N> v) {
3042 const RebindToSigned<decltype(df32)> di32;
3043 const RebindToUnsigned<decltype(df32)> du32;
3044 // Expand to u32 so we can shift.
3045 const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
3046 const auto sign = ShiftRight<15>(bits16);
3047 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
3048 const auto mantissa = bits16 & Set(du32, 0x3FF);
3049 const auto subnormal =
3050 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
3051 Set(df32, 1.0f / 16384 / 1024));
3052
3053 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
3054 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
3055 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3056 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
3057 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3058}
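// Scalar sketch of the same binary16 decode (s = bit 15, e = bits 14..10,
// m = bits 9..0); FromBits here stands for a bit-cast from uint32_t to float:
//   if (e == 0) value = m * (1.0f / 16384 / 1024);  // subnormal: m * 2^-24
//   else value = FromBits(((e + 127 - 15) << 23) | (m << 13));
//   if (s) value = -value;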
3059
3060#endif
3061
3062#if HWY_ARCH_ARM_A64
3063
3064HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
3065 const Vec64<float> v) {
3066 return Vec128<double>(vcvt_f64_f32(v.raw));
3067}
3068
3069HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
3070 const Vec32<float> v) {
3071 return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
3072}
3073
3074HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
3075 const Vec64<int32_t> v) {
3076 const int64x2_t i64 = vmovl_s32(v.raw);
3077 return Vec128<double>(vcvtq_f64_s64(i64));
3078}
3079
3080HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
3081 const Vec32<int32_t> v) {
3082 const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
3083 return Vec64<double>(vcvt_f64_s64(i64));
3084}
3085
3086#endif
3087
3088// ------------------------------ Demotions (full -> part w/ narrow lanes)
3089
3090// From full vector to half or quarter
3091HWY_API Vec64<uint16_t> DemoteTo(Full64<uint16_t> /* tag */,
3092 const Vec128<int32_t> v) {
3093 return Vec64<uint16_t>(vqmovun_s32(v.raw));
3094}
3095HWY_API Vec64<int16_t> DemoteTo(Full64<int16_t> /* tag */,
3096 const Vec128<int32_t> v) {
3097 return Vec64<int16_t>(vqmovn_s32(v.raw));
3098}
3099HWY_API Vec32<uint8_t> DemoteTo(Full32<uint8_t> /* tag */,
3100 const Vec128<int32_t> v) {
3101 const uint16x4_t a = vqmovun_s32(v.raw);
3102 return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
3103}
3104HWY_API Vec64<uint8_t> DemoteTo(Full64<uint8_t> /* tag */,
3105 const Vec128<int16_t> v) {
3106 return Vec64<uint8_t>(vqmovun_s16(v.raw));
3107}
3108HWY_API Vec32<int8_t> DemoteTo(Full32<int8_t> /* tag */,
3109 const Vec128<int32_t> v) {
3110 const int16x4_t a = vqmovn_s32(v.raw);
3111 return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
3112}
3113HWY_API Vec64<int8_t> DemoteTo(Full64<int8_t> /* tag */,
3114 const Vec128<int16_t> v) {
3115 return Vec64<int8_t>(vqmovn_s16(v.raw));
3117
3118// From half vector to partial half
3119template <size_t N, HWY_IF_LE64(int32_t, N)>
3120HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
3121 const Vec128<int32_t, N> v) {
3122 return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
3123}
3124template <size_t N, HWY_IF_LE64(int32_t, N)>
3125HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
3126 const Vec128<int32_t, N> v) {
3127 return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
3128}
3129template <size_t N, HWY_IF_LE64(int32_t, N)>
3130HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3131 const Vec128<int32_t, N> v) {
3132 const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
3133 return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a)));
3134}
3135template <size_t N, HWY_IF_LE64(int16_t, N)>
3136HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3137 const Vec128<int16_t, N> v) {
3138 return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
3139}
3140template <size_t N, HWY_IF_LE64(int32_t, N)>
3141HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
3142 const Vec128<int32_t, N> v) {
3143 const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
3144 return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a)));
3145}
3146template <size_t N, HWY_IF_LE64(int16_t, N)>
3147HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
3148 const Vec128<int16_t, N> v) {
3149 return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
3150}
3151
3152#if __ARM_FP & 2
3153
3154HWY_API Vec128<float16_t, 4> DemoteTo(Full64<float16_t> /* tag */,
3155 const Vec128<float> v) {
3156 return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
3157}
3158template <size_t N>
3159HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
3160 const Vec128<float, N> v) {
3161 const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
3162 return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
3163}
3164
3165#else
3166
3167template <size_t N>
3168HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
3169 const Vec128<float, N> v) {
3170 const RebindToUnsigned<decltype(df16)> du16;
3171 const Rebind<uint32_t, decltype(du16)> du;
3172 const RebindToSigned<decltype(du)> di;
3173 const auto bits32 = BitCast(du, v);
3174 const auto sign = ShiftRight<31>(bits32);
3175 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3176 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3177
3178 const auto k15 = Set(di, 15);
3179 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3180 const auto is_tiny = exp < Set(di, -24);
3181
3182 const auto is_subnormal = exp < Set(di, -14);
3183 const auto biased_exp16 =
3184 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3185 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3186 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3187 (mantissa32 >> (Set(du, 13) + sub_exp));
3188 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3189 ShiftRight<13>(mantissa32)); // <1024
3190
3191 const auto sign16 = ShiftLeft<15>(sign);
3192 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3193 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3194 return Vec128<float16_t, N>(DemoteTo(du16, bits16).raw);
3195}
3196
3197#endif
3198
3199template <size_t N>
3200HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
3201 const Vec128<float, N> v) {
3202 const Rebind<int32_t, decltype(dbf16)> di32;
3203 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3204 const Rebind<uint16_t, decltype(dbf16)> du16;
3205 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3206 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3207}
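// bfloat16 is the upper half of a binary32, so this demotion is a plain
// truncating shift (no round-to-nearest). Scalar sketch:
//   const uint32_t bits32 = 0x40490FDBu;           // 3.14159274f
//   const uint16_t bf16 = uint16_t(bits32 >> 16);  // 0x4049, ~3.140625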
3208
3209#if HWY_ARCH_ARM_A64
3210
3211HWY_API Vec64<float> DemoteTo(Full64<float> /* tag */, const Vec128<double> v) {
3212 return Vec64<float>(vcvt_f32_f64(v.raw));
3213}
3214HWY_API Vec32<float> DemoteTo(Full32<float> /* tag */, const Vec64<double> v) {
3215 return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
3216}
3217
3218HWY_API Vec64<int32_t> DemoteTo(Full64<int32_t> /* tag */,
3219 const Vec128<double> v) {
3220 const int64x2_t i64 = vcvtq_s64_f64(v.raw);
3221 return Vec64<int32_t>(vqmovn_s64(i64));
3222}
3223HWY_API Vec32<int32_t> DemoteTo(Full32<int32_t> /* tag */,
3224 const Vec64<double> v) {
3225 const int64x1_t i64 = vcvt_s64_f64(v.raw);
3226 // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first.
3227 const int64x2_t i64x2 = vcombine_s64(i64, i64);
3228 return Vec32<int32_t>(vqmovn_s64(i64x2));
3229}
3230
3231#endif
3232
3233HWY_API Vec32<uint8_t> U8FromU32(const Vec128<uint32_t> v) {
3234 const uint8x16_t org_v = detail::BitCastToByte(v).raw;
3235 const uint8x16_t w = vuzp1q_u8(org_v, org_v);
3236 return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
3237}
3238template <size_t N, HWY_IF_LE64(uint32_t, N)>
3239HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
3240 const uint8x8_t org_v = detail::BitCastToByte(v).raw;
3241 const uint8x8_t w = vuzp1_u8(org_v, org_v);
3242 return Vec128<uint8_t, N>(vuzp1_u8(w, w));
3243}
3244
3245// In the following DemoteTo functions, |b| is purposely undefined.
3246// The value a needs to be extended to 128 bits so that vqmovn can be
3247// used and |b| is undefined so that no extra overhead is introduced.
3248HWY_DIAGNOSTICS(push)
3249HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
3250
3251template <size_t N>
3252HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3253 const Vec128<int32_t> v) {
3254 Vec128<uint16_t, N> a = DemoteTo(Simd<uint16_t, N, 0>(), v);
3255 Vec128<uint16_t, N> b;
3256 uint16x8_t c = vcombine_u16(a.raw, b.raw);
3257 return Vec128<uint8_t, N>(vqmovn_u16(c));
3258}
3259
3260template <size_t N>
3261HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
3262 const Vec128<int32_t> v) {
3263 Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N, 0>(), v);
3264 Vec128<int16_t, N> b;
3265 int16x8_t c = vcombine_s16(a.raw, b.raw);
3266 return Vec128<int8_t, N>(vqmovn_s16(c));
3267}
3268
3269HWY_DIAGNOSTICS(pop)
3270
3271// ------------------------------ Convert integer <=> floating-point
3272
3273HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
3274 const Vec128<int32_t> v) {
3275 return Vec128<float>(vcvtq_f32_s32(v.raw));
3276}
3277template <size_t N, HWY_IF_LE64(int32_t, N)>
3278HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
3279 const Vec128<int32_t, N> v) {
3280 return Vec128<float, N>(vcvt_f32_s32(v.raw));
3281}
3282
3283// Truncates (rounds toward zero).
3284HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
3285 const Vec128<float> v) {
3286 return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
3287}
3288template <size_t N, HWY_IF_LE64(float, N)>
3289HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
3290 const Vec128<float, N> v) {
3291 return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
3292}
3293
3294#if HWY_ARCH_ARM_A64
3295
3296HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
3297 const Vec128<int64_t> v) {
3298 return Vec128<double>(vcvtq_f64_s64(v.raw));
3299}
3300HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
3301 const Vec64<int64_t> v) {
3302 return Vec64<double>(vcvt_f64_s64(v.raw));
3303}
3304
3305// Truncates (rounds toward zero).
3306HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
3307 const Vec128<double> v) {
3308 return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
3309}
3310HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> /* tag */,
3311 const Vec64<double> v) {
3312 return Vec64<int64_t>(vcvt_s64_f64(v.raw));
3313}
3314
3315#endif
3316
3317// ------------------------------ Round (IfThenElse, mask, logical)
3318
3319#if HWY_ARCH_ARM_A64
3320// Toward nearest integer
3321HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
3322
3323// Toward zero, aka truncate
3324HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)
3325
3326// Toward +infinity, aka ceiling
3327HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)
3328
3329// Toward -infinity, aka floor
3330HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
3331#else
3332
3333// ------------------------------ Trunc
3334
3335// ARMv7 only supports truncation to integer. We can either convert back to
3336// float (3 floating-point and 2 logic operations) or manipulate the binary32
3337// representation, clearing the lowest 23-exp mantissa bits. This requires 9
3338// integer operations and 3 constants, which is likely more expensive.
3339
3340namespace detail {
3341
3342// The original value is already the desired result if NaN or the magnitude is
3343// large (i.e. the value is already an integer).
3344template <size_t N>
3345HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
3346 return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
3347}
3348
3349} // namespace detail
3350
3351template <size_t N>
3352HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
3353 const DFromV<decltype(v)> df;
3354 const RebindToSigned<decltype(df)> di;
3355
3356 const auto integer = ConvertTo(di, v); // round toward 0
3357 const auto int_f = ConvertTo(df, integer);
3358
3359 return IfThenElse(detail::UseInt(v), int_f, v);
3360}
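// Usage sketch: the int32 round-trip handles |v| < 2^23; UseInt() lets values
// that are already integral (or NaN, or too large for int32) pass through:
//   Trunc(Set(df, -1.5f));  // -1.0f
//   Trunc(Set(df, 1e20f));  // 1e20f, unchanged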
3361
3362template <size_t N>
3363HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
3364 const DFromV<decltype(v)> df;
3365
3366 // ARMv7 also lacks a native NearestInt, but we can instead rely on rounding
3367 // (we assume the current mode is nearest-even) after addition with a large
3368 // value such that no mantissa bits remain. We may need a compiler flag for
3369 // precise floating-point to prevent this from being "optimized" out.
3370 const auto max = Set(df, MantissaEnd<float>());
3371 const auto large = CopySignToAbs(max, v);
3372 const auto added = large + v;
3373 const auto rounded = added - large;
3374
3375 // Keep original if NaN or the magnitude is large (already an int).
3376 return IfThenElse(Abs(v) < max, rounded, v);
3377}
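// Scalar sketch of the add-subtract trick, assuming round-to-nearest-even:
//   const float large = copysignf(8388608.0f, v);  // 2^23 = MantissaEnd
//   const float rounded = (large + v) - large;     // fraction bits rounded off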
3378
3379template <size_t N>
3380HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
3381 const DFromV<decltype(v)> df;
3382 const RebindToSigned<decltype(df)> di;
3383
3384 const auto integer = ConvertTo(di, v); // round toward 0
3385 const auto int_f = ConvertTo(df, integer);
3386
3387 // Truncating a positive non-integer ends up smaller; if so, add 1.
3388 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
3389
3390 return IfThenElse(detail::UseInt(v), int_f - neg1, v);
3391}
3392
3393template <size_t N>
3394HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
3395 const DFromV<decltype(v)> df;
3396 const RebindToSigned<decltype(df)> di;
3397
3398 const auto integer = ConvertTo(di, v); // round toward 0
3399 const auto int_f = ConvertTo(df, integer);
3400
3401 // Truncating a negative non-integer ends up larger; if so, subtract 1.
3402 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
3403
3404 return IfThenElse(detail::UseInt(v), int_f + neg1, v);
3405}
3406
3407#endif
3408
3409// ------------------------------ NearestInt (Round)
3410
3411#if HWY_ARCH_ARM_A64
3412
3413HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
3414 return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
3415}
3416template <size_t N, HWY_IF_LE64(float, N)>
3417HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
3418 return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
3419}
3420
3421#else
3422
3423template <size_t N>
3424HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
3425 const RebindToSigned<DFromV<decltype(v)>> di;
3426 return ConvertTo(di, Round(v));
3427}
3428
3429#endif
3430
3431// ------------------------------ Floating-point classification
3432template <typename T, size_t N>
3433HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
3434 return v != v;
3435}
3436
3437template <typename T, size_t N, HWY_IF_FLOAT(T)>
3438HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
3439 const Simd<T, N, 0> d;
3440 const RebindToSigned<decltype(d)> di;
3441 const VFromD<decltype(di)> vi = BitCast(di, v);
3442 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
3443 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
3444}
3445
3446// Returns whether normal/subnormal/zero.
3447template <typename T, size_t N, HWY_IF_FLOAT(T)>
3448HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
3449 const Simd<T, N, 0> d;
3450 const RebindToUnsigned<decltype(d)> du;
3451 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
3452 const VFromD<decltype(du)> vu = BitCast(du, v);
3453 // 'Shift left' to clear the sign bit, then right so we can compare with the
3454 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
3455 // negative and non-negative floats would be greater).
3456 const VFromD<decltype(di)> exp =
3457 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
3458 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
3459}
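// Scalar sketch of the same test for float: doubling shifts out the sign and
// the unsigned shift brings the exponent field down for a signed compare:
//   const uint32_t u = BitCastScalar(f);  // stands for a bit-cast to uint32_t
//   const uint32_t exp = (u + u) >> 24;   // MantissaBits<float>() + 1 == 24
//   const bool finite = exp < 0xFF;       // 0xFF encodes Inf and NaN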
3460
3461// ================================================== SWIZZLE
3462
3463// ------------------------------ LowerHalf
3464
3465// <= 64 bit: just return different type
3466template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>
3467HWY_API Vec128<T, N / 2> LowerHalf(const Vec128<T, N> v) {
3468 return Vec128<T, N / 2>(v.raw);
3469}
3470
3471HWY_API Vec64<uint8_t> LowerHalf(const Vec128<uint8_t> v) {
3472 return Vec64<uint8_t>(vget_low_u8(v.raw));
3473}
3474HWY_API Vec64<uint16_t> LowerHalf(const Vec128<uint16_t> v) {
3475 return Vec64<uint16_t>(vget_low_u16(v.raw));
3476}
3477HWY_API Vec64<uint32_t> LowerHalf(const Vec128<uint32_t> v) {
3478 return Vec64<uint32_t>(vget_low_u32(v.raw));
3479}
3480HWY_API Vec64<uint64_t> LowerHalf(const Vec128<uint64_t> v) {
3481 return Vec64<uint64_t>(vget_low_u64(v.raw));
3482}
3483HWY_API Vec64<int8_t> LowerHalf(const Vec128<int8_t> v) {
3484 return Vec64<int8_t>(vget_low_s8(v.raw));
3485}
3486HWY_API Vec64<int16_t> LowerHalf(const Vec128<int16_t> v) {
3487 return Vec64<int16_t>(vget_low_s16(v.raw));
3488}
3489HWY_API Vec64<int32_t> LowerHalf(const Vec128<int32_t> v) {
3490 return Vec64<int32_t>(vget_low_s32(v.raw));
3491}
3492HWY_API Vec64<int64_t> LowerHalf(const Vec128<int64_t> v) {
3493 return Vec64<int64_t>(vget_low_s64(v.raw));
3494}
3495HWY_API Vec64<float> LowerHalf(const Vec128<float> v) {
3496 return Vec64<float>(vget_low_f32(v.raw));
3497}
3498#if HWY_ARCH_ARM_A64
3499HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
3500 return Vec64<double>(vget_low_f64(v.raw));
3501}
3502#endif
3503
3504template <typename T, size_t N>
3505HWY_API Vec128<T, N / 2> LowerHalf(Half<Simd<T, N, 0>> /* tag */,
3506 Vec128<T, N> v) {
3507 return LowerHalf(v);
3508}
3509
3510// ------------------------------ CombineShiftRightBytes
3511
3512// 128-bit
3513template <int kBytes, typename T, class V128 = Vec128<T>>
3514HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
3515 static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
3516 const Repartition<uint8_t, decltype(d)> d8;
3517 uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3518 return BitCast(d, Vec128<uint8_t>(v8));
3519}
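// Usage sketch: kBytes = 4 returns bytes [4..15] of lo followed by bytes
// [0..3] of hi, i.e. a byte-wise extract across the pair (vextq semantics):
//   const Full128<uint32_t> d;
//   const auto r = CombineShiftRightBytes<4>(d, hi, lo);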
3520
3521// 64-bit
3522template <int kBytes, typename T>
3523HWY_API Vec64<T> CombineShiftRightBytes(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
3524 static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
3525 const Repartition<uint8_t, decltype(d)> d8;
3526 uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3527 return BitCast(d, VFromD<decltype(d8)>(v8));
3528}
3529
3530// <= 32-bit defined after ShiftLeftBytes.
3531
3532// ------------------------------ Shift vector by constant #bytes
3533
3534namespace detail {
3535
3536// Partially specialize because kBytes = 0 and >= size are compile errors;
3537// callers replace the latter with 0xFF for easier specialization.
3538template <int kBytes>
3539struct ShiftLeftBytesT {
3540 // Full
3541 template <class T>
3542 HWY_INLINE Vec128<T> operator()(const Vec128<T> v) {
3543 const Full128<T> d;
3544 return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
3545 }
3546
3547 // Partial
3548 template <class T, size_t N, HWY_IF_LE64(T, N)>
3549 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
3550 // Expand to 64-bit so we only use the native EXT instruction.
3551 const Full64<T> d64;
3552 const auto zero64 = Zero(d64);
3553 const decltype(zero64) v64(v.raw);
3554 return Vec128<T, N>(
3555 CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3556 }
3557};
3558template <>
3559struct ShiftLeftBytesT<0> {
3560 template <class T, size_t N>
3561 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
3562 return v;
3563 }
3564};
3565template <>
3566struct ShiftLeftBytesT<0xFF> {
3567 template <class T, size_t N>
3568 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> /* v */) {
3569 return Zero(Simd<T, N, 0>());
3570 }
3571};
3572
3573template <int kBytes>
3574struct ShiftRightBytesT {
3575 template <class T, size_t N>
3576 HWY_INLINE Vec128<T, N> operator()(Vec128<T, N> v) {
3577 const Simd<T, N, 0> d;
3578 // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
3579 if (N * sizeof(T) < 8) {
3580 constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
3581 const Simd<T, kReg / sizeof(T), 0> dreg;
3582 v = Vec128<T, N>(
3583 IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
3584 }
3585 return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
3586 }
3587};
3588template <>
3589struct ShiftRightBytesT<0> {
3590 template <class T, size_t N>
3591 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
3592 return v;
3593 }
3594};
3595template <>
3596struct ShiftRightBytesT<0xFF> {
3597 template <class T, size_t N>
3598 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> /* v */) {
3599 return Zero(Simd<T, N, 0>());
3600 }
3601};
3602
3603} // namespace detail
3604
3605template <int kBytes, typename T, size_t N>
3606HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3607 return detail::ShiftLeftBytesT < kBytes >= N * sizeof(T) ? 0xFF
3608 : kBytes > ()(v);
3609}
3610
3611template <int kBytes, typename T, size_t N>
3612HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3613 return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
3614}
3615
3616template <int kLanes, typename T, size_t N>
3617HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3618 const Repartition<uint8_t, decltype(d)> d8;
3619 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3620}
3621
3622template <int kLanes, typename T, size_t N>
3623HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3624 return ShiftLeftLanes<kLanes>(Simd<T, N, 0>(), v);
3625}
3626
3627// 0x01..0F, kBytes = 1 => 0x0001..0E
3628template <int kBytes, typename T, size_t N>
3629HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3630 return detail::ShiftRightBytesT < kBytes >= N * sizeof(T) ? 0xFF
3631 : kBytes > ()(v);
3632}
3633
3634template <int kLanes, typename T, size_t N>
3635HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3636 const Repartition<uint8_t, decltype(d)> d8;
3637 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3638}
3639
3640// Calls ShiftLeftBytes
3641template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
3642HWY_API Vec128<T, N> CombineShiftRightBytes(Simd<T, N, 0> d, Vec128<T, N> hi,
3643 Vec128<T, N> lo) {
3644 constexpr size_t kSize = N * sizeof(T);
3645 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3646 const Repartition<uint8_t, decltype(d)> d8;
3647 const Full64<uint8_t> d_full8;
3648 const Repartition<T, decltype(d_full8)> d_full;
3649 using V64 = VFromD<decltype(d_full8)>;
3650 const V64 hi64(BitCast(d8, hi).raw);
3651 // Move into most-significant bytes
3652 const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
3653 const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
3654 // After casting to full 64-bit vector of correct type, shrink to 32-bit
3655 return Vec128<T, N>(BitCast(d_full, r).raw);
3656}
3657
3658// ------------------------------ UpperHalf (ShiftRightBytes)
3659
3660// Full input
3661HWY_API Vec64<uint8_t> UpperHalf(Full64<uint8_t> /* tag */,
3662 const Vec128<uint8_t> v) {
3663 return Vec64<uint8_t>(vget_high_u8(v.raw));
3664}
3665HWY_API Vec64<uint16_t> UpperHalf(Full64<uint16_t> /* tag */,
3666 const Vec128<uint16_t> v) {
3667 return Vec64<uint16_t>(vget_high_u16(v.raw));
3668}
3669HWY_API Vec64<uint32_t> UpperHalf(Full64<uint32_t> /* tag */,
3670 const Vec128<uint32_t> v) {
3671 return Vec64<uint32_t>(vget_high_u32(v.raw));
3672}
3673HWY_API Vec64<uint64_t> UpperHalf(Full64<uint64_t> /* tag */,
3674 const Vec128<uint64_t> v) {
3675 return Vec64<uint64_t>(vget_high_u64(v.raw));
3676}
3677HWY_API Vec64<int8_t> UpperHalf(Full64<int8_t> /* tag */,
3678 const Vec128<int8_t> v) {
3679 return Vec64<int8_t>(vget_high_s8(v.raw));
3680}
3681HWY_API Vec64<int16_t> UpperHalf(Full64<int16_t> /* tag */,
3682 const Vec128<int16_t> v) {
3683 return Vec64<int16_t>(vget_high_s16(v.raw));
3684}
3685HWY_API Vec64<int32_t> UpperHalf(Full64<int32_t> /* tag */,
3686 const Vec128<int32_t> v) {
3687 return Vec64<int32_t>(vget_high_s32(v.raw));
3688}
3689HWY_API Vec64<int64_t> UpperHalf(Full64<int64_t> /* tag */,
3690 const Vec128<int64_t> v) {
3691 return Vec64<int64_t>(vget_high_s64(v.raw));
3692}
3693HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
3694 return Vec64<float>(vget_high_f32(v.raw));
3695}
3696#if HWY_ARCH_ARM_A64
3697HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
3698 const Vec128<double> v) {
3699 return Vec64<double>(vget_high_f64(v.raw));
3700}
3701#endif
3702
3703// Partial
3704template <typename T, size_t N, HWY_IF_LE64(T, N)>
3705HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3706 Vec128<T, N> v) {
3707 const DFromV<decltype(v)> d;
3708 const RebindToUnsigned<decltype(d)> du;
3709 const auto vu = BitCast(du, v);
3710 const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3711 return Vec128<T, (N + 1) / 2>(upper.raw);
3712}
3713
3714// ------------------------------ Broadcast/splat any lane
3715
3716#if HWY_ARCH_ARM_A64
3717// Unsigned
3718template <int kLane>
3719HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
3720 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3721 return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
3722}
3723template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3724HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3725 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3726 return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3727}
3728template <int kLane>
3729HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
3730 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3731 return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
3732}
3733template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3734HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3735 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3736 return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3737}
3738template <int kLane>
3739HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
3740 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3741 return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
3742}
3743// Vec64<uint64_t> is defined below.
3744
3745// Signed
3746template <int kLane>
3747HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
3748 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3749 return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
3750}
3751template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3752HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3753 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3754 return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3755}
3756template <int kLane>
3757HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
3758 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3759 return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
3760}
3761template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3762HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3763 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3764 return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3765}
3766template <int kLane>
3767HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
3768 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3769 return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
3770}
3771// Vec64<int64_t> is defined below.
3772
3773// Float
3774template <int kLane>
3775HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
3776 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3777 return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
3778}
3779template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3780HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3781 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3782 return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3783}
3784template <int kLane>
3785HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
3786 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3787 return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
3788}
3789template <int kLane>
3790HWY_API Vec64<double> Broadcast(const Vec64<double> v) {
3791 static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3792 return v;
3793}
3794
3795#else
3796// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
3797
3798// Unsigned
3799template <int kLane>
3800HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
3801 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3802 return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
3803}
3804template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3805HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3806 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3807 return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3808}
3809template <int kLane>
3810HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
3811 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3812 return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
3813}
3814template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3815HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3816 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3817 return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3818}
3819template <int kLane>
3820HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
3821 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3822 return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
3823}
3824// Vec64<uint64_t> is defined below.
3825
3826// Signed
3827template <int kLane>
3828HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
3829 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3830 return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
3831}
3832template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3833HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3834 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3835 return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3836}
3837template <int kLane>
3838HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
3839 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3840 return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
3841}
3842template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3843HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3844 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3845 return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3846}
3847template <int kLane>
3848HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
3849 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3850 return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
3851}
3852// Vec64<int64_t> is defined below.
3853
3854// Float
3855template <int kLane>
3856HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
3857 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3858 return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
3859}
3860template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3861HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3862 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3863 return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3864}
3865
3866#endif
3867
3868template <int kLane>
3869HWY_API Vec64<uint64_t> Broadcast(const Vec64<uint64_t> v) {
3870 static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3871 return v;
3872}
3873template <int kLane>
3874HWY_API Vec64<int64_t> Broadcast(const Vec64<int64_t> v) {
3875 static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3876 return v;
3877}
3878
3879// ------------------------------ TableLookupLanes
3880
3881// Returned by SetTableIndices for use by TableLookupLanes.
3882template <typename T, size_t N>
3883struct Indices128 {
3884 typename detail::Raw128<T, N>::type raw;
3885};
3886
3887template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3888HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
3889 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3890#if HWY_IS_DEBUG_BUILD
3891 const Rebind<TI, decltype(d)> di;
3892 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3893 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
3894#endif
3895
3896 const Repartition<uint8_t, decltype(d)> d8;
3897 using V8 = VFromD<decltype(d8)>;
3898 const Repartition<uint16_t, decltype(d)> d16;
3899
3900 // Broadcast each lane index to all bytes of T and shift to bytes
3901 static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
3902 if (sizeof(T) == 4) {
3903 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3904 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3905 const V8 lane_indices =
3906 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
3907 const V8 byte_indices =
3908 BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
3909 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3910 0, 1, 2, 3, 0, 1, 2, 3};
3911 const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
3912 return Indices128<T, N>{BitCast(d, sum).raw};
3913 } else {
3914 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3915 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3916 const V8 lane_indices =
3917 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
3918 const V8 byte_indices =
3919 BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
3920 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
3921 0, 1, 2, 3, 4, 5, 6, 7};
3922 const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
3923 return Indices128<T, N>{BitCast(d, sum).raw};
3924 }
3925}
3926
3927template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3928HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
3929 const Rebind<TI, decltype(d)> di;
3930 return IndicesFromVec(d, LoadU(di, idx));
3931}
3932
3933template <typename T, size_t N>
3934HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
3935 const DFromV<decltype(v)> d;
3936 const RebindToSigned<decltype(d)> di;
3937 return BitCast(
3938 d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
3939}
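// Usage sketch: a lane permutation built once and reused, here reversing
// four u32 lanes:
//   const Full128<uint32_t> d;
//   alignas(16) constexpr uint32_t kRev[4] = {3, 2, 1, 0};
//   const auto idx = SetTableIndices(d, kRev);
//   const auto reversed = TableLookupLanes(v, idx);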
3940
3941// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
3942
3943// Single lane: no change
3944template <typename T>
3945HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
3946 return v;
3947}
3948
3949// Two lanes: shuffle
3950template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3951HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
3952 return Vec128<T, 2>(Shuffle2301(v));
3953}
3954
3955template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3956HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3957 return Shuffle01(v);
3958}
3959
3960// Four lanes: shuffle
3961template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3962HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3963 return Shuffle0123(v);
3964}
3965
3966// 16-bit
3967template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3968HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
3969 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3970 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
3971}
3972
3973// ------------------------------ Reverse2
3974
3975template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
3976HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
3977 const RebindToUnsigned<decltype(d)> du;
3978 return BitCast(d, Vec128<uint16_t, N>(vrev32_u16(BitCast(du, v).raw)));
3979}
3980template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3981HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
3982 const RebindToUnsigned<decltype(d)> du;
3983 return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
3984}
3985
3986template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
3987HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
3988 const RebindToUnsigned<decltype(d)> du;
3989 return BitCast(d, Vec128<uint32_t, N>(vrev64_u32(BitCast(du, v).raw)));
3990}
3991template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3992HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
3993 const RebindToUnsigned<decltype(d)> du;
3994 return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
3995}
3996
3997template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3998HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
3999 return Shuffle01(v);
4000}
4001
4002// ------------------------------ Reverse4
4003
4004template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4005HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
4006 const RebindToUnsigned<decltype(d)> du;
4007 return BitCast(d, Vec128<uint16_t, N>(vrev64_u16(BitCast(du, v).raw)));
4008}
4009template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4010HWY_API Vec128<T> Reverse4(Full128<T> d, const Vec128<T> v) {
4011 const RebindToUnsigned<decltype(d)> du;
4012 return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
4013}
4014
4015template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4016HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4017 return Shuffle0123(v);
4018}
4019
4020template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4021HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
4022 HWY_ASSERT(0); // don't have 8 u64 lanes
4023}
4024
4025// ------------------------------ Reverse8
4026
4027template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4028HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
4029 return Reverse(d, v);
4030}
4031
4032template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4033HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
4034 HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
4035}
4036
4037// ------------------------------ Other shuffles (TableLookupBytes)
4038
4039// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
4040// Shuffle0321 rotates one lane to the right (the previous least-significant
4041// lane is now most-significant). These could also be implemented via
4042// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
4043
4044// Swap 64-bit halves
4045template <typename T>
4046HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
4047 return CombineShiftRightBytes<8>(Full128<T>(), v, v);
4048}
4049template <typename T>
4050HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
4051 return CombineShiftRightBytes<8>(Full128<T>(), v, v);
4052}
4053
4054// Rotate right 32 bits
4055template <typename T>
4056HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
4057 return CombineShiftRightBytes<4>(Full128<T>(), v, v);
4058}
4059
4060// Rotate left 32 bits
4061template <typename T>
4062HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
4063 return CombineShiftRightBytes<12>(Full128<T>(), v, v);
4064}
4065
4066// Reverse
4067template <typename T>
4068HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
4069 return Shuffle2301(Shuffle1032(v));
4070}
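// Worked example (illustrative addition): for v = Iota (lanes 3,2,1,0 hold
// 3,2,1,0), Shuffle0321 rotates right, so the memory order becomes
// {1, 2, 3, 0}; Shuffle0123 would reverse it to {3, 2, 1, 0}.
inline Vec128<uint32_t> RotateLanesRightDemo() {
  const Full128<uint32_t> d;
  return Shuffle0321(Iota(d, 0));
}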
4071
4072// ------------------------------ InterleaveLower
4073
4074// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
4075// the least-significant lane) and "b". To concatenate two half-width integers
4076// into one, use ZipLower/Upper instead (also works with scalar).
4077HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2)
4078HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2)
4079
4080#if HWY_ARCH_ARM_A64
4081// N=1 makes no sense (in that case, there would be no upper/lower).
4082HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
4083 const Vec128<uint64_t> b) {
4084 return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
4085}
4086HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
4087 const Vec128<int64_t> b) {
4088 return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
4089}
4090HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
4091 const Vec128<double> b) {
4092 return Vec128<double>(vzip1q_f64(a.raw, b.raw));
4093}
4094#else
4095// ARMv7 emulation.
4096HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
4097 const Vec128<uint64_t> b) {
4098 return CombineShiftRightBytes<8>(Full128<uint64_t>(), b, Shuffle01(a));
4099}
4100HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
4101 const Vec128<int64_t> b) {
4102 return CombineShiftRightBytes<8>(Full128<int64_t>(), b, Shuffle01(a));
4103}
4104#endif
4105
4106// Floats
4107HWY_API Vec128<float> InterleaveLower(const Vec128<float> a,
4108 const Vec128<float> b) {
4109 return Vec128<float>(vzip1q_f32(a.raw, b.raw));
4110}
4111template <size_t N, HWY_IF_LE64(float, N)>
4112HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
4113 const Vec128<float, N> b) {
4114 return Vec128<float, N>(vzip1_f32(a.raw, b.raw));
4115}
4116
4117// < 64 bit parts
4118template <typename T, size_t N, HWY_IF_LE32(T, N)>
4119HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
4120 return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw);
4121}
4122
4123// Additional overload for the optional Simd<> tag.
4124template <typename T, size_t N, class V = Vec128<T, N>>
4125HWY_API V InterleaveLower(Simd<T, N, 0> /* tag */, V a, V b) {
4126 return InterleaveLower(a, b);
4127}
4128
4129// ------------------------------ InterleaveUpper (UpperHalf)
4130
4131// All functions inside detail lack the required D parameter.
4132namespace detail {
4133HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
4134HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
4135
4136#if HWY_ARCH_ARM_A64
4137// N=1 makes no sense (in that case, there would be no upper/lower).
4138HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4139 const Vec128<uint64_t> b) {
4140 return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
4141}
4142HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
4143 return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
4144}
4145HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) {
4146 return Vec128<double>(vzip2q_f64(a.raw, b.raw));
4147}
4148#else
4149// ARMv7 emulation.
4150HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4151 const Vec128<uint64_t> b) {
4152 return CombineShiftRightBytes<8>(Full128<uint64_t>(), Shuffle01(b), a);
4153}
4154HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
4155 return CombineShiftRightBytes<8>(Full128<int64_t>(), Shuffle01(b), a);
4156}
4157#endif
4158
4159HWY_API Vec128<float> InterleaveUpper(Vec128<float> a, Vec128<float> b) {
4160 return Vec128<float>(vzip2q_f32(a.raw, b.raw));
4161}
4162HWY_API Vec64<float> InterleaveUpper(const Vec64<float> a,
4163 const Vec64<float> b) {
4164 return Vec64<float>(vzip2_f32(a.raw, b.raw));
4165}
4166
4167} // namespace detail
4168
4169// Full register
4170template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>
4171HWY_API V InterleaveUpper(Simd<T, N, 0> /* tag */, V a, V b) {
4172 return detail::InterleaveUpper(a, b);
4173}
4174
4175// Partial
4176template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
4177HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
4178 const Half<decltype(d)> d2;
4179 return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw));
4180}
4181
4182// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
4183
4184// Same as Interleave*, except that the return lanes are double-width integers;
4185// this is necessary because the single-lane scalar cannot return two values.
4186template <class V, class DW = RepartitionToWide<DFromV<V>>>
4187HWY_API VFromD<DW> ZipLower(V a, V b) {
4188 return BitCast(DW(), InterleaveLower(a, b));
4189}
4190template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4191HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4192 return BitCast(dw, InterleaveLower(D(), a, b));
4193}
4194
4195template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4196HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4197 return BitCast(dw, InterleaveUpper(D(), a, b));
4198}
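// Usage sketch (illustrative addition): zipping two u8 vectors yields u16
// lanes; the lower byte of each u16 comes from "a", the upper byte from "b".
inline Vec128<uint16_t> ZipBytesDemo(Vec128<uint8_t> a, Vec128<uint8_t> b) {
  const Full128<uint16_t> dw;
  return ZipLower(dw, a, b);
}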
4199
4200// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4201
4202template <size_t N>
4203HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
4204 Vec128<bfloat16_t, 2 * N> a,
4205 Vec128<bfloat16_t, 2 * N> b,
4206 const Vec128<float, N> sum0,
4207 Vec128<float, N>& sum1) {
4208 const Repartition<uint16_t, decltype(df32)> du16;
4209 const RebindToUnsigned<decltype(df32)> du32;
4210 const Vec128<uint16_t, 2 * N> zero = Zero(du16);
4211 const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
4212 const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4213 const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
4214 const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4215 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
4216 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
4217}
4218
4219// ================================================== COMBINE
4220
4221// ------------------------------ Combine (InterleaveLower)
4222
4223// Full result
4224HWY_API Vec128<uint8_t> Combine(Full128<uint8_t> /* tag */, Vec64<uint8_t> hi,
4225 Vec64<uint8_t> lo) {
4226 return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
4227}
4228HWY_API Vec128<uint16_t> Combine(Full128<uint16_t> /* tag */,
4229 Vec64<uint16_t> hi, Vec64<uint16_t> lo) {
4230 return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
4231}
4232HWY_API Vec128<uint32_t> Combine(Full128<uint32_t> /* tag */,
4233 Vec64<uint32_t> hi, Vec64<uint32_t> lo) {
4234 return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
4235}
4236HWY_API Vec128<uint64_t> Combine(Full128<uint64_t> /* tag */,
4237 Vec64<uint64_t> hi, Vec64<uint64_t> lo) {
4238 return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
4239}
4240
4241HWY_API Vec128<int8_t> Combine(Full128<int8_t> /* tag */, Vec64<int8_t> hi,
4242 Vec64<int8_t> lo) {
4243 return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
4244}
4245HWY_API Vec128<int16_t> Combine(Full128<int16_t> /* tag */, Vec64<int16_t> hi,
4246 Vec64<int16_t> lo) {
4247 return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
4248}
4249HWY_API Vec128<int32_t> Combine(Full128<int32_t> /* tag */, Vec64<int32_t> hi,
4250 Vec64<int32_t> lo) {
4251 return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
4252}
4253HWY_API Vec128<int64_t> Combine(Full128<int64_t> /* tag */, Vec64<int64_t> hi,
4254 Vec64<int64_t> lo) {
4255 return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
4256}
4257
4258HWY_API Vec128<float> Combine(Full128<float> /* tag */, Vec64<float> hi,
4259 Vec64<float> lo) {
4260 return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
4261}
4262#if HWY_ARCH_ARM_A64
4263HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec64<double> hi,
4264 Vec64<double> lo) {
4265 return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
4266}
4267#endif
4268
4269// < 64bit input, <= 64 bit result
4270template <typename T, size_t N, HWY_IF_LE64(T, N)>
4271HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi,
4272 Vec128<T, N / 2> lo) {
4273 // First double N (only lower halves will be used).
4274 const Vec128<T, N> hi2(hi.raw);
4275 const Vec128<T, N> lo2(lo.raw);
4276 // Repartition to two unsigned lanes (each the size of the valid input).
4277 const Simd<UnsignedFromSize<N * sizeof(T) / 2>, 2, 0> du;
4278 return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
4279}
4280
4281// ------------------------------ ZeroExtendVector (Combine)
4282
4283template <typename T, size_t N>
4284HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
4285 return Combine(d, Zero(Half<decltype(d)>()), lo);
4286}
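// Usage sketch (illustrative addition): Combine concatenates two half
// vectors as hi:lo; ZeroExtendVector is the special case of a zero hi.
inline Vec128<uint32_t> WidenWithZeroUpperDemo(Vec64<uint32_t> lo) {
  return ZeroExtendVector(Full128<uint32_t>(), lo);
}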
4287
4288// ------------------------------ ConcatLowerLower
4289
4290// 64 or 128-bit input: just interleave
4291template <typename T, size_t N, HWY_IF_GE64(T, N)>
4292HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
4293 Vec128<T, N> lo) {
4294 // Treat half-width input as a single lane and interleave them.
4295 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4296 return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
4297}
4298
4299namespace detail {
4300#if HWY_ARCH_ARM_A64
4301HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveEven, vtrn1, _, 2)
4302HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveOdd, vtrn2, _, 2)
4303#else
4304
4305// vtrn returns a struct with even and odd result.
4306#define HWY_NEON_BUILD_TPL_HWY_TRN
4307#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
4308// Pass raw args so we can accept uint16x2 args, for which there is no
4309// corresponding uint16x2x2 return type.
4310#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
4311 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
4312#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
4313
4314// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
4315// for full and half vectors.
4316HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
4317HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
4318HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
4319HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
4320HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
4321HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
4322HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
4323HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
4324HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
4325HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
4326HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
4327HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
4328HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
4329HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)
4330#endif
4331} // namespace detail
4332
4333// <= 32-bit input/output
4334template <typename T, size_t N, HWY_IF_LE32(T, N)>
4335HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
4336 Vec128<T, N> lo) {
4337 // Treat half-width input as two lanes and take every second one.
4338 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4339#if HWY_ARCH_ARM_A64
4340 return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
4341#else
4342 using VU = VFromD<decltype(du)>;
4343 return BitCast(
4344 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
4345 .val[0]));
4346#endif
4347}
4348
4349// ------------------------------ ConcatUpperUpper
4350
4351// 64 or 128-bit input: just interleave
4352template <typename T, size_t N, HWY_IF_GE64(T, N)>
4353HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4354 Vec128<T, N> lo) {
4355 // Treat half-width input as a single lane and interleave them.
4356 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4357 return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
4358}
4359
4360// <= 32-bit input/output
4361template <typename T, size_t N, HWY_IF_LE32(T, N)>
4362HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4363 Vec128<T, N> lo) {
4364 // Treat half-width input as two lanes and take every second one.
4365 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4366#if HWY_ARCH_ARM_A64
4367 return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
4368#else
4369 using VU = VFromD<decltype(du)>;
4370 return BitCast(
4371 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
4372 .val[1]));
4373#endif
4374}
4375
4376// ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
4377
4378// 64 or 128-bit input: extract from concatenated
4379template <typename T, size_t N, HWY_IF_GE64(T, N)>
4380HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4381 Vec128<T, N> lo) {
4382 return CombineShiftRightBytes<N * sizeof(T) / 2>(d, hi, lo);
4383}
4384
4385// <= 32-bit input/output
4386template <typename T, size_t N, HWY_IF_LE32(T, N)>
4387HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4388 Vec128<T, N> lo) {
4389 constexpr size_t kSize = N * sizeof(T);
4390 const Repartition<uint8_t, decltype(d)> d8;
4391 const Full64<uint8_t> d8x8;
4392 const Full64<T> d64;
4393 using V8x8 = VFromD<decltype(d8x8)>;
4394 const V8x8 hi8x8(BitCast(d8, hi).raw);
4395 // Move into most-significant bytes
4396 const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
4397 const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
4398 // Back to original lane type, then shrink N.
4399 return Vec128<T, N>(BitCast(d64, r).raw);
4400}
4401
4402// ------------------------------ ConcatUpperLower
4403
4404// Works for all N.
4405template <typename T, size_t N>
4406HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4407 Vec128<T, N> lo) {
4408 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
4409}
4410
4411// ------------------------------ ConcatOdd (InterleaveUpper)
4412
4413namespace detail {
4414// There is no vuzpq_u64.
4415HWY_NEON_DEF_FUNCTION_UIF81632(ConcatEven, vuzp1, _, 2)
4416HWY_NEON_DEF_FUNCTION_UIF81632(ConcatOdd, vuzp2, _, 2)
4417} // namespace detail
4418
4419// Full/half vector
4420template <typename T, size_t N,
4421 hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
4422HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
4423 Vec128<T, N> lo) {
4424 return detail::ConcatOdd(lo, hi);
4425}
4426
4427// 8-bit x4
4428template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4429HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> d, Vec128<T, 4> hi,
4430 Vec128<T, 4> lo) {
4431 const Twice<decltype(d)> d2;
4432 const Repartition<uint16_t, decltype(d2)> dw2;
4433 const VFromD<decltype(d2)> hi2(hi.raw);
4434 const VFromD<decltype(d2)> lo2(lo.raw);
4435 const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
4436 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
4437 // vcopy_lane_u16, but that's A64-only.
4438 return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
4439}
4440
4441// Any type x2
4442template <typename T>
4443HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4444 Vec128<T, 2> lo) {
4445 return InterleaveUpper(d, lo, hi);
4446}
4447
4448// ------------------------------ ConcatEven (InterleaveLower)
4449
4450// Full/half vector
4451template <typename T, size_t N,
4452 hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
4453HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
4454 Vec128<T, N> lo) {
4455 return detail::ConcatEven(lo, hi);
4456}
4457
4458// 8-bit x4
4459template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4460HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> d, Vec128<T, 4> hi,
4461 Vec128<T, 4> lo) {
4462 const Twice<decltype(d)> d2;
4463 const Repartition<uint16_t, decltype(d2)> dw2;
4464 const VFromD<decltype(d2)> hi2(hi.raw);
4465 const VFromD<decltype(d2)> lo2(lo.raw);
4466 const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
4467 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
4468 // vcopy_lane_u16, but that's A64-only.
4469 return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
4470}
4471
4472// Any type x2
4473template <typename T>
4474HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4475 Vec128<T, 2> lo) {
4476 return InterleaveLower(d, lo, hi);
4477}
4478
4479// ------------------------------ DupEven (InterleaveLower)
4480
4481template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4482HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4483#if HWY_ARCH_ARM_A64
4484 return detail::InterleaveEven(v, v);
4485#else
4486 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
4487#endif
4488}
4489
4490template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4491HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4492 return InterleaveLower(Simd<T, N, 0>(), v, v);
4493}
4494
4495// ------------------------------ DupOdd (InterleaveUpper)
4496
4497template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4498HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4499#if HWY_ARCH_ARM_A64
4500 return detail::InterleaveOdd(v, v);
4501#else
4502 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
4503#endif
4504}
4505
4506template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4507HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4508 return InterleaveUpper(Simd<T, N, 0>(), v, v);
4509}
4510
4511// ------------------------------ OddEven (IfThenElse)
4512
4513template <typename T, size_t N>
4514HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4515 const Simd<T, N, 0> d;
4516 const Repartition<uint8_t, decltype(d)> d8;
4517 alignas(16) constexpr uint8_t kBytes[16] = {
4518 ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
4519 ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
4520 ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
4521 ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
4522 ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
4523 ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
4524 ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
4525 ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
4526 };
4527 const auto vec = BitCast(d, Load(d8, kBytes));
4528 return IfThenElse(MaskFromVec(vec), b, a);
4529}
4530
4531// ------------------------------ OddEvenBlocks
4532template <typename T, size_t N>
4533HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4534 return even;
4535}
4536
4537// ------------------------------ SwapAdjacentBlocks
4538
4539template <typename T, size_t N>
4540HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4541 return v;
4542}
4543
4544// ------------------------------ ReverseBlocks
4545
4546// Single block: no change
4547template <typename T>
4548HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
4549 return v;
4550}
4551
4552// ------------------------------ ReorderDemote2To (OddEven)
4553
4554template <size_t N>
4555HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
4556 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4557 const RebindToUnsigned<decltype(dbf16)> du16;
4558 const Repartition<uint32_t, decltype(dbf16)> du32;
4559 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4560 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4561}
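// Usage sketch (illustrative addition): packing two f32 vectors into one
// bf16 vector by truncating the mantissa; this is the companion of the
// bf16 PromoteTo/ReorderWidenMulAccumulate operations.
inline Vec128<bfloat16_t> PackBf16Demo(Vec128<float> a, Vec128<float> b) {
  const Repartition<bfloat16_t, Full128<float>> dbf16;
  return ReorderDemote2To(dbf16, a, b);
}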
4562
4563// ================================================== CRYPTO
4564
4565#if defined(__ARM_FEATURE_AES)
4566
4567// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4568#ifdef HWY_NATIVE_AES
4569#undef HWY_NATIVE_AES
4570#else
4571#define HWY_NATIVE_AES
4572#endif
4573
4574HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4575 Vec128<uint8_t> round_key) {
4576 // NOTE: it is important that AESE and AESMC be consecutive instructions so
4577 // they can be fused. AESE includes AddRoundKey, which is a different ordering
4578// than the AES-NI semantics we adopted, so we XOR with a zero key here and
4579// with the actual round key afterwards (hopefully optimized across rounds).
4580 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4581 round_key;
4582}
4583
4584HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
4585 Vec128<uint8_t> round_key) {
4586 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4587}
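// Usage sketch (illustrative addition): AES-128 encryption of one block given
// 11 expanded round keys (key schedule not shown), following the AES-NI
// ordering noted above: initial AddRoundKey, nine AESRound, one AESLastRound.
inline Vec128<uint8_t> AesEncryptBlockDemo(
    Vec128<uint8_t> block, const Vec128<uint8_t> round_keys[11]) {
  block = Xor(block, round_keys[0]);  // initial AddRoundKey
  for (int r = 1; r <= 9; ++r) {
    block = AESRound(block, round_keys[r]);
  }
  return AESLastRound(block, round_keys[10]);
}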
4588
4589HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4590 return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
4591}
4592
4593HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4594 return Vec128<uint64_t>(
4595 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4596}
4597
4598#endif // __ARM_FEATURE_AES
4599
4600// ================================================== MISC
4601
4602template <size_t N>
4603HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
4604 const Vec128<bfloat16_t, N> v) {
4605 const Rebind<uint16_t, decltype(df32)> du16;
4606 const RebindToSigned<decltype(df32)> di32;
4607 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4608}
4609
4610// ------------------------------ MulEven (ConcatEven)
4611
4612// Multiplies even lanes (0, 2, ..) and places the double-wide result into
4613// the even lane, with its upper half in the odd neighbor lane.
4614HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
4615 const Full128<int32_t> d;
4616 int32x4_t a_packed = ConcatEven(d, a, a).raw;
4617 int32x4_t b_packed = ConcatEven(d, b, b).raw;
4618 return Vec128<int64_t>(
4619 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4620}
4621HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
4622 const Full128<uint32_t> d;
4623 uint32x4_t a_packed = ConcatEven(d, a, a).raw;
4624 uint32x4_t b_packed = ConcatEven(d, b, b).raw;
4625 return Vec128<uint64_t>(
4626 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4627}
4628
4629template <size_t N>
4630HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
4631 const Vec128<int32_t, N> b) {
4632 const DFromV<decltype(a)> d;
4633 int32x2_t a_packed = ConcatEven(d, a, a).raw;
4634 int32x2_t b_packed = ConcatEven(d, b, b).raw;
4635 return Vec128<int64_t, (N + 1) / 2>(
4636 vget_low_s64(vmull_s32(a_packed, b_packed)));
4637}
4638template <size_t N>
4639HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
4640 const Vec128<uint32_t, N> b) {
4641 const DFromV<decltype(a)> d;
4642 uint32x2_t a_packed = ConcatEven(d, a, a).raw;
4643 uint32x2_t b_packed = ConcatEven(d, b, b).raw;
4644 return Vec128<uint64_t, (N + 1) / 2>(
4645 vget_low_u64(vmull_u32(a_packed, b_packed)));
4646}
4647
4648HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4649 uint64_t hi;
4650 uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
4651 return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4652}
4653
4654HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4655 uint64_t hi;
4656 uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
4657 return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4658}
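// Usage sketch (illustrative addition): full 64x64 -> 128-bit products for
// both lanes; afterwards lane i of lo/hi holds the low/high half of a[i]*b[i].
inline void Mul64To128Demo(Vec128<uint64_t> a, Vec128<uint64_t> b,
                           Vec128<uint64_t>& lo, Vec128<uint64_t>& hi) {
  const Full128<uint64_t> d;
  const Vec128<uint64_t> even = MulEven(a, b);  // lane1=high0, lane0=low0
  const Vec128<uint64_t> odd = MulOdd(a, b);    // lane1=high1, lane0=low1
  lo = ConcatEven(d, odd, even);  // lane1=low1, lane0=low0
  hi = ConcatOdd(d, odd, even);   // lane1=high1, lane0=high0
}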
4659
4660// ------------------------------ TableLookupBytes (Combine, LowerHalf)
4661
4662// Both full
4663template <typename T, typename TI>
4664HWY_API Vec128<TI> TableLookupBytes(const Vec128<T> bytes,
4665 const Vec128<TI> from) {
4666 const Full128<TI> d;
4667 const Repartition<uint8_t, decltype(d)> d8;
4668#if HWY_ARCH_ARM_A64
4669 return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
4670 BitCast(d8, from).raw)));
4671#else
4672 uint8x16_t table0 = BitCast(d8, bytes).raw;
4673 uint8x8x2_t table;
4674 table.val[0] = vget_low_u8(table0);
4675 table.val[1] = vget_high_u8(table0);
4676 uint8x16_t idx = BitCast(d8, from).raw;
4677 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4678 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4679 return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
4680#endif
4681}
4682
4683// Partial index vector
4684template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
4685HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T> bytes,
4686 const Vec128<TI, NI> from) {
4687 const Full128<TI> d_full;
4688 const Vec64<TI> from64(from.raw);
4689 const auto idx_full = Combine(d_full, from64, from64);
4690 const auto out_full = TableLookupBytes(bytes, idx_full);
4691 return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
4692}
4693
4694// Partial table vector
4695template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>
4696HWY_API Vec128<TI> TableLookupBytes(const Vec128<T, N> bytes,
4697 const Vec128<TI> from) {
4698 const Full128<T> d_full;
4699 return TableLookupBytes(Combine(d_full, bytes, bytes), from);
4700}
4701
4702// Partial both
4703template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N),
4704 HWY_IF_LE64(TI, NI)>
4705HWY_API Vec128<TI, NI> TableLookupBytes(
4706 Vec128<T, N> bytes, Vec128<TI, NI> from) {
4707 const Simd<T, N, 0> d;
4708 const Simd<TI, NI, 0> d_idx;
4709 const Repartition<uint8_t, decltype(d_idx)> d_idx8;
4710 // uint8x8
4711 const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
4712 const auto from8 = BitCast(d_idx8, from);
4713 const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4714 return BitCast(d_idx, v8);
4715}
4716
4717// For all vector widths; ARM zeroes the result byte anyway for indices >= 0x10.
4718template <class V, class VI>
4719HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
4720 return TableLookupBytes(bytes, from);
4721}
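// Usage sketch (illustrative addition): reversing the bytes of a u8 vector;
// output byte i is bytes[kRev[i]].
inline Vec128<uint8_t> ReverseBytesDemo(Vec128<uint8_t> v) {
  const Full128<uint8_t> d;
  alignas(16) constexpr uint8_t kRev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                            7,  6,  5,  4,  3,  2,  1, 0};
  return TableLookupBytes(v, Load(d, kRev));
}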
4722
4723// ------------------------------ Scatter (Store)
4724
4725template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
4726HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
4727 T* HWY_RESTRICT base,
4728 const Vec128<Offset, N> offset) {
4729 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4730
4731 alignas(16) T lanes[N];
4732 Store(v, d, lanes);
4733
4734 alignas(16) Offset offset_lanes[N];
4735 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
4736
4737 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
4738 for (size_t i = 0; i < N; ++i) {
4739 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
4740 }
4741}
4742
4743template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
4744HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
4745 const Vec128<Index, N> index) {
4746 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4747
4748 alignas(16) T lanes[N];
4749 Store(v, d, lanes);
4750
4751 alignas(16) Index index_lanes[N];
4752 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
4753
4754 for (size_t i = 0; i < N; ++i) {
4755 base[index_lanes[i]] = lanes[i];
4756 }
4757}
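// Usage sketch (illustrative addition): lane i of v is stored to
// base[index[i]]; indices must be in bounds and should not collide.
inline void ScatterDemo(Vec128<int32_t> v, int32_t* HWY_RESTRICT base,
                        Vec128<int32_t> index) {
  ScatterIndex(v, Full128<int32_t>(), base, index);
}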
4758
4759// ------------------------------ Gather (Load/Store)
4760
4761template <typename T, size_t N, typename Offset>
4762HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
4763 const T* HWY_RESTRICT base,
4764 const Vec128<Offset, N> offset) {
4765 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4766
4767 alignas(16) Offset offset_lanes[N];
4768 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
4769
4770 alignas(16) T lanes[N];
4771 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
4772 for (size_t i = 0; i < N; ++i) {
4773 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
4774 }
4775 return Load(d, lanes);
4776}
4777
4778template <typename T, size_t N, typename Index>
4779HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
4780 const T* HWY_RESTRICT base,
4781 const Vec128<Index, N> index) {
4782 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4783
4784 alignas(16) Index index_lanes[N];
4785 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
4786
4787 alignas(16) T lanes[N];
4788 for (size_t i = 0; i < N; ++i) {
4789 lanes[i] = base[index_lanes[i]];
4790 }
4791 return Load(d, lanes);
4792}
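// Usage sketch (illustrative addition): lane i of the result is
// base[index[i]]; the index type must match the lane size (here 32-bit).
inline Vec128<float> GatherDemo(const float* HWY_RESTRICT base,
                                Vec128<int32_t> index) {
  return GatherIndex(Full128<float>(), base, index);
}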
4793
4794// ------------------------------ Reductions
4795
4796namespace detail {
4797
4798// N=1 for any T: no-op
4799template <typename T>
4800HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
4801 return v;
4802}
4803template <typename T>
4804HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4805 const Vec128<T, 1> v) {
4806 return v;
4807}
4808template <typename T>
4809HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4810 const Vec128<T, 1> v) {
4811 return v;
4812}
4813
4814// u32/i32/f32: N=2
4815template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4816HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
4817 return v10 + Shuffle2301(v10);
4818}
4819template <typename T>
4820HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
4821 const Vec128<T, 2> v10) {
4822 return Min(v10, Shuffle2301(v10));
4823}
4824template <typename T>
4825HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4826 const Vec128<T, 2> v10) {
4827 return Max(v10, Shuffle2301(v10));
4828}
4829
4830// full vectors
4831#if HWY_ARCH_ARM_A64
4832HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4833 return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
4834}
4835HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4836 return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
4837}
4838HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4839 return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
4840}
4841HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4842 return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
4843}
4844HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4845 return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
4846}
4847HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
4848 return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
4849}
4850#else
4851// ARMv7 version for everything except doubles.
4852HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4853 uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
4854 uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
4855 uint32x4x2_t v1 = vuzpq_u32(c0, c0);
4856 return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
4857}
4858HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4859 int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
4860 int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
4861 int32x4x2_t v1 = vuzpq_s32(c0, c0);
4862 return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
4863}
4864HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4865 float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
4866 float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
4867 float32x4x2_t v1 = vuzpq_f32(c0, c0);
4868 return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
4869}
4870HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4871 return v + Shuffle01(v);
4872}
4873HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4874 return v + Shuffle01(v);
4875}
4876#endif
4877
4878template <typename T>
4879HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
4880 const Vec128<T> v3210) {
4881 const Vec128<T> v1032 = Shuffle1032(v3210);
4882 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
4883 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4884 return Min(v20_31_20_31, v31_20_31_20);
4885}
4886template <typename T>
4887HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4888 const Vec128<T> v3210) {
4889 const Vec128<T> v1032 = Shuffle1032(v3210);
4890 const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
4891 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4892 return Max(v20_31_20_31, v31_20_31_20);
4893}
4894
4895// For u64/i64[/f64].
4896template <typename T>
4897HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
4898 const Vec128<T> v10) {
4899 const Vec128<T> v01 = Shuffle01(v10);
4900 return Min(v10, v01);
4901}
4902template <typename T>
4903HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
4904 const Vec128<T> v10) {
4905 const Vec128<T> v01 = Shuffle01(v10);
4906 return Max(v10, v01);
4907}
4908
4909// u16/i16
4910template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4911HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4912 const Simd<int32_t, N / 2, 0> d32;
4913 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4914 const auto odd = ShiftRight<16>(BitCast(d32, v));
4915 const auto min = MinOfLanes(d32, Min(even, odd));
4916 // Also broadcast into odd lanes.
4917 return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
4918}
4919template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4920HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
4921 const Simd<int32_t, N / 2, 0> d32;
4922 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4923 const auto odd = ShiftRight<16>(BitCast(d32, v));
4924 const auto min = MaxOfLanes(d32, Max(even, odd));
4925 // Also broadcast into odd lanes.
4926 return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
4927}
4928
4929} // namespace detail
4930
4931template <typename T, size_t N>
4932HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4933 return detail::SumOfLanes(v);
4934}
4935template <typename T, size_t N>
4936HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4937 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4938}
4939template <typename T, size_t N>
4940HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4941 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4942}
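// Usage sketch (illustrative addition): the reductions broadcast their result
// to all lanes, so GetLane extracts the scalar value.
inline float HorizontalSumDemo(Vec128<float> v) {
  const Full128<float> d;
  return GetLane(SumOfLanes(d, v));
}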
4943
4944// ------------------------------ LoadMaskBits (TestBit)
4945
4946namespace detail {
4947
4948// Helper function to set 64 bits and potentially return a smaller vector. The
4949// overload is required to select the q vs non-q intrinsics. Note that 8-bit
4950// LoadMaskBits only requires 16 bits, but 64 avoids casting.
4951template <typename T, size_t N, HWY_IF_LE64(T, N)>
4952HWY_INLINE Vec128<T, N> Set64(Simd<T, N, 0> /* tag */, uint64_t mask_bits) {
4953 const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
4954 return Vec128<T, N>(BitCast(Full64<T>(), v64).raw);
4955}
4956template <typename T>
4957HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
4958 return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
4959}
4960
4961template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4962HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4963 const RebindToUnsigned<decltype(d)> du;
4964 // Easier than Set(), which would require an >8-bit type, which would not
4965 // compile for T=uint8_t, N=1.
4966 const auto vmask_bits = Set64(du, mask_bits);
4967
4968 // Replicate bytes 8x such that each byte contains the bit that governs it.
4969 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4970 1, 1, 1, 1, 1, 1, 1, 1};
4971 const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));
4972
4973 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4974 1, 2, 4, 8, 16, 32, 64, 128};
4975 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4976}
4977
4978template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4979HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4980 const RebindToUnsigned<decltype(d)> du;
4981 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4982 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4983 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4984}
4985
4986template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4987HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4988 const RebindToUnsigned<decltype(d)> du;
4989 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4990 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4991 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4992}
4993
4994template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4995HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
4996 const RebindToUnsigned<decltype(d)> du;
4997 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
4998 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4999}
5000
5001} // namespace detail
5002
5003// `p` points to at least 8 readable bytes, not all of which need be valid.
5004template <typename T, size_t N, HWY_IF_LE128(T, N)>
5005HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
5006 const uint8_t* HWY_RESTRICT bits) {
5007 uint64_t mask_bits = 0;
5008 CopyBytes<(N + 7) / 8>(bits, &mask_bits);
5009 return detail::LoadMaskBits(d, mask_bits);
5010}
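// Usage sketch (illustrative addition): bit i (LSB-first) of the input bytes
// controls lane i, so 0x05 (binary 0101) selects lanes 0 and 2.
inline Mask128<uint32_t> MaskFromBitsDemo() {
  const Full128<uint32_t> d;
  alignas(8) constexpr uint8_t kBits[8] = {0x05, 0, 0, 0, 0, 0, 0, 0};
  return LoadMaskBits(d, kBits);
}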
5011
5012// ------------------------------ Mask
5013
5014namespace detail {
5015
5016// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
5017// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
5018template <typename T>
5019HWY_INLINE uint64_t NibblesFromMask(const Full128<T> d, Mask128<T> mask) {
5020 const Full128<uint16_t> du16;
5021 const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
5022 const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
5023 return GetLane(BitCast(Full64<uint64_t>(), nib));
5024}
5025
5026template <typename T>
5027HWY_INLINE uint64_t NibblesFromMask(Full64<T> d, Mask64<T> mask) {
5028 // There is no vshrn_n_u16 for uint16x4, so zero-extend.
5029 const Twice<decltype(d)> d2;
5030 const Vec128<T> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
5031 // No need to mask, upper half is zero thanks to ZeroExtendVector.
5032 return NibblesFromMask(d2, MaskFromVec(v128));
5033}
5034
5035template <typename T, size_t N, HWY_IF_LE32(T, N)>
5036HWY_INLINE uint64_t NibblesFromMask(Simd<T, N, 0> /*d*/, Mask128<T, N> mask) {
5037 const Mask64<T> mask64(mask.raw);
5038 const uint64_t nib = NibblesFromMask(Full64<T>(), mask64);
5039 // Clear nibbles from upper half of 64-bits
5040 constexpr size_t kBytes = sizeof(T) * N;
5041 return nib & ((1ull << (kBytes * 4)) - 1);
5042}
5043
5044template <typename T>
5045HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
5046 const Mask128<T> mask) {
5047 alignas(16) constexpr uint8_t kSliceLanes[16] = {
5048 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
5049 };
5050 const Full128<uint8_t> du;
5051 const Vec128<uint8_t> values =
5052 BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
5053
5054#if HWY_ARCH_ARM_A64
5055 // Can't vaddv - we need two separate bytes (16 bits).
5056 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
5057 const uint8x8_t x4 = vpadd_u8(x2, x2);
5058 const uint8x8_t x8 = vpadd_u8(x4, x4);
5059 return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
5060#else
5061 // Don't have vpaddq, so keep doubling lane size.
5062 const uint16x8_t x2 = vpaddlq_u8(values.raw);
5063 const uint32x4_t x4 = vpaddlq_u16(x2);
5064 const uint64x2_t x8 = vpaddlq_u32(x4);
5065 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
5066#endif
5067}
5068
5069template <typename T, size_t N, HWY_IF_LE64(T, N)>
5070HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
5071 const Mask128<T, N> mask) {
5072 // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5073 // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5074 alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
5075 0x10, 0x20, 0x40, 0x80};
5076 const Simd<T, N, 0> d;
5077 const RebindToUnsigned<decltype(d)> du;
5078 const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
5079 const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5080
5081#if HWY_ARCH_ARM_A64
5082 return vaddv_u8(values.raw);
5083#else
5084 const uint16x4_t x2 = vpaddl_u8(values.raw);
5085 const uint32x2_t x4 = vpaddl_u16(x2);
5086 const uint64x1_t x8 = vpaddl_u32(x4);
5087 return vget_lane_u64(x8, 0);
5088#endif
5089}
5090
5091template <typename T>
5092HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
5093 const Mask128<T> mask) {
5094 alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
5095 0x10, 0x20, 0x40, 0x80};
5096 const Full128<T> d;
5097 const Full128<uint16_t> du;
5098 const Vec128<uint16_t> values =
5099 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
5100#if HWY_ARCH_ARM_A64
5101 return vaddvq_u16(values.raw);
5102#else
5103 const uint32x4_t x2 = vpaddlq_u16(values.raw);
5104 const uint64x2_t x4 = vpaddlq_u32(x2);
5105 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
5106#endif
5107}
5108
5109template <typename T, size_t N, HWY_IF_LE64(T, N)>
5110HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
5111 const Mask128<T, N> mask) {
5112 // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5113 // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5114 alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
5115 const Simd<T, N, 0> d;
5116 const RebindToUnsigned<decltype(d)> du;
5117 const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
5118 const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5119#if HWY_ARCH_ARM_A64
5120 return vaddv_u16(values.raw);
5121#else
5122 const uint32x2_t x2 = vpaddl_u16(values.raw);
5123 const uint64x1_t x4 = vpaddl_u32(x2);
5124 return vget_lane_u64(x4, 0);
5125#endif
5126}
5127
5128template <typename T>
5129HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
5130 const Mask128<T> mask) {
5131 alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
5132 const Full128<T> d;
5133 const Full128<uint32_t> du;
5134 const Vec128<uint32_t> values =
5135 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
5136#if HWY_ARCH_ARM_A64
5137 return vaddvq_u32(values.raw);
5138#else
5139 const uint64x2_t x2 = vpaddlq_u32(values.raw);
5140 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
5141#endif
5142}
5143
5144template <typename T, size_t N, HWY_IF_LE64(T, N)>
5145HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
5146 const Mask128<T, N> mask) {
5147 // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5148 // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5149 alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
5150 const Simd<T, N, 0> d;
5151 const RebindToUnsigned<decltype(d)> du;
5152 const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
5153 const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5154#if HWY_ARCH_ARM_A64
5155 return vaddv_u32(values.raw);
5156#else
5157 const uint64x1_t x2 = vpaddl_u32(values.raw);
5158 return vget_lane_u64(x2, 0);
5159#endif
5160}
5161
5162template <typename T>
5163HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
5164 alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
5165 const Full128<T> d;
5166 const Full128<uint64_t> du;
5167 const Vec128<uint64_t> values =
5168 BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
5169#if HWY_ARCH_ARM_A64
5170 return vaddvq_u64(values.raw);
5171#else
5172 return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
5173#endif
5174}
5175
5176template <typename T>
5177HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
5178 const Mask128<T, 1> m) {
5179 const Full64<T> d;
5180 const Full64<uint64_t> du;
5181 const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
5182 return vget_lane_u64(values.raw, 0);
5183}
5184
5185// Returns the lowest N for the BitsFromMask result.
5186template <typename T, size_t N>
5187constexpr uint64_t OnlyActive(uint64_t bits) {
5188 return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
5189}
5190
5191template <typename T, size_t N>
5192HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
5193 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5194}
5195
5196// Returns number of lanes whose mask is set.
5197//
5198// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
5199// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
5200// changes each lane to 1 (if mask set) or 0.
5201// NOTE: PopCount also operates on vectors, so we still have to do horizontal
5202// sums separately. We specialize CountTrue for full vectors (negating instead
5203// of PopCount because it avoids an extra shift), and use PopCount of
5204// NibblesFromMask for partial vectors.
5205
5206template <typename T>
5207HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
5208 const Full128<int8_t> di;
5209 const int8x16_t ones =
5210 vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5211
5212#if HWY_ARCH_ARM_A64
5213 return static_cast<size_t>(vaddvq_s8(ones));
5214#else
5215 const int16x8_t x2 = vpaddlq_s8(ones);
5216 const int32x4_t x4 = vpaddlq_s16(x2);
5217 const int64x2_t x8 = vpaddlq_s32(x4);
5218 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
5219#endif
5220}
5221template <typename T>
5222HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
5223 const Full128<int16_t> di;
5224 const int16x8_t ones =
5225 vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5226
5227#if HWY_ARCH_ARM_A64
5228 return static_cast<size_t>(vaddvq_s16(ones));
5229#else
5230 const int32x4_t x2 = vpaddlq_s16(ones);
5231 const int64x2_t x4 = vpaddlq_s32(x2);
5232 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
5233#endif
5234}
5235
5236template <typename T>
5237HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
5238 const Full128<int32_t> di;
5239 const int32x4_t ones =
5240 vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5241
5242#if HWY_ARCH_ARM_A64
5243 return static_cast<size_t>(vaddvq_s32(ones));
5244#else
5245 const int64x2_t x2 = vpaddlq_s32(ones);
5246 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
5247#endif
5248}
5249
5250template <typename T>
5251HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
5252#if HWY_ARCH_ARM_A64
5253 const Full128<int64_t> di;
5254 const int64x2_t ones =
5255 vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5256 return static_cast<size_t>(vaddvq_s64(ones));
5257#else
5258 const Full128<uint64_t> du;
5259 const auto mask_u = VecFromMask(du, RebindMask(du, mask));
5260 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
5261 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
5262#endif
5263}
5264
5265} // namespace detail
5266
5267// Full
5268template <typename T>
5269HWY_API size_t CountTrue(Full128<T> /* tag */, const Mask128<T> mask) {
5270 return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
5271}
5272
5273// Partial
5274template <typename T, size_t N, HWY_IF_LE64(T, N)>
5275HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
5276 constexpr int kDiv = 4 * sizeof(T);
5277 return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
5278}
5279template <typename T, size_t N>
5280HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
5281 const Mask128<T, N> mask) {
5282 const uint64_t nib = detail::NibblesFromMask(d, mask);
5283 if (nib == 0) return -1;
5284 constexpr int kDiv = 4 * sizeof(T);
5285 return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
5286}
5287
5288// `p` points to at least 8 writable bytes.
5289template <typename T, size_t N>
5290HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
5291 uint8_t* bits) {
5292 const uint64_t mask_bits = detail::BitsFromMask(mask);
5293 const size_t kNumBytes = (N + 7) / 8;
5294 CopyBytes<kNumBytes>(&mask_bits, bits);
5295 return kNumBytes;
5296}
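// Usage sketch (illustrative addition): serializing a mask to packed bits;
// for 8 lanes this writes one byte whose bit i reflects lane i's mask.
inline uint64_t MaskToBitsDemo(Mask128<uint16_t> m) {
  const Full128<uint16_t> d;
  alignas(8) uint8_t bits[8] = {0};
  (void)StoreMaskBits(d, m, bits);  // returns the number of bytes written (1)
  return bits[0];
}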
5297
5298template <typename T, size_t N>
5299HWY_API bool AllFalse(const Simd<T, N, 0> d, const Mask128<T, N> m) {
5300 return detail::NibblesFromMask(d, m) == 0;
5301}
5302
5303// Full
5304template <typename T>
5305HWY_API bool AllTrue(const Full128<T> d, const Mask128<T> m) {
5306 return detail::NibblesFromMask(d, m) == ~0ull;
5307}
5308// Partial
5309template <typename T, size_t N, HWY_IF_LE64(T, N)>
5310HWY_API bool AllTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
5311 constexpr size_t kBytes = sizeof(T) * N;
5312 return detail::NibblesFromMask(d, m) == (1ull << (kBytes * 4)) - 1;
5313}
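// Usage sketch (illustrative addition): branching on comparison results via
// the nibble-based AllFalse above.
inline bool AnyNegativeDemo(Vec128<float> v) {
  const Full128<float> d;
  return !AllFalse(d, v < Zero(d));
}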
5314
5315// ------------------------------ Compress
5316
5317template <typename T>
5318struct CompressIsPartition {
5319 enum { value = 1 };
5320};
5321
5322namespace detail {
5323
5324// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
5325HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /*d*/,
5326 const uint8_t* bytes) {
5327 return Vec128<uint8_t>(vreinterpretq_u8_u64(
5328 vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
5329}
5330
5331// Load 8 bytes and return half-reg with N <= 8 bytes.
5332template <size_t N, HWY_IF_LE64(uint8_t, N)>
5333HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N, 0> d,
5334 const uint8_t* bytes) {
5335 return Load(d, bytes);
5336}
5337
5338template <typename T, size_t N>
5339HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
5340 const uint64_t mask_bits) {
5341 HWY_DASSERT(mask_bits < 256);
5342 const Simd<T, N, 0> d;
5343 const Repartition<uint8_t, decltype(d)> d8;
5344 const Simd<uint16_t, N, 0> du;
5345
5346 // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
5347 // indices for VTBL (one vector's worth for each of 256 combinations of
5348 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5349 // store lane indices and convert to byte indices (2*lane + 0..1), with the
5350 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5351 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5352 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5353 // is likely more costly than the higher cache footprint from storing bytes.
5354 alignas(16) constexpr uint8_t table[256 * 8] = {
5355 // PrintCompress16x8Tables
5356 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5357 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5358 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
5359 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5360 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
5361 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
5362 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
5363 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5364 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
5365 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
5366 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
5367 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
5368 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
5369 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
5370 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
5371 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5372 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
5373 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
5374 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
5375 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
5376 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
5377 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
5378 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
5379 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
5380 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
5381 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
5382 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
5383 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
5384 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
5385 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
5386 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
5387 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5388 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
5389 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
5390 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
5391 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
5392 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
5393 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
5394 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
5395 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
5396 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
5397 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
5398 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
5399 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
5400 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
5401 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
5402 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
5403 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
5404 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
5405 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
5406 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
5407 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
5408 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
5409 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
5410 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
5411 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
5412 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
5413 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
5414 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
5415 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
5416 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
5417 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
5418 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
5419 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5420 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
5421 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
5422 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
5423 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
5424 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
5425 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
5426 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
5427 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
5428 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
5429 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
5430 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
5431 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
5432 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
5433 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
5434 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
5435 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
5436 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
5437 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
5438 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
5439 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
5440 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
5441 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
5442 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
5443 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
5444 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
5445 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
5446 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
5447 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
5448 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
5449 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
5450 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
5451 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
5452 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
5453 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
5454 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
5455 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
5456 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
5457 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
5458 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
5459 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
5460 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
5461 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
5462 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
5463 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
5464 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
5465 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
5466 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
5467 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
5468 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
5469 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
5470 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
5471 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
5472 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
5473 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
5474 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
5475 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
5476 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
5477 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
5478 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
5479 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
5480 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
5481 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
5482 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
5483 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5484
5485 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
5486 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5487 return BitCast(d, pairs + Set(du, 0x0100));
5488}
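// Note (illustrative addition): Compress, defined further below, selects one
// of these 256 rows via the 8-bit mask and applies it with TableLookupBytes;
// each row stores byte indices formed as 2*lane + {0, 1}, so lanes whose mask
// bit is set move to the front, matching CompressIsPartition above.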
5489
5490template <typename T, size_t N>
5491HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
5492 const uint64_t mask_bits) {
5493 HWY_DASSERT(mask_bits < 256);
5494 const Simd<T, N, 0> d;
5495 const Repartition<uint8_t, decltype(d)> d8;
5496 const Simd<uint16_t, N, 0> du;
5497
5498 // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
5499 // indices for VTBL (one vector's worth for each of 256 combinations of
5500 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5501 // store lane indices and convert to byte indices (2*lane + 0..1), with the
5502 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5503 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5504 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5505 // is likely more costly than the higher cache footprint from storing bytes.
5506 alignas(16) constexpr uint8_t table[256 * 8] = {
5507 // PrintCompressNot16x8Tables
5508 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
5509 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
5510 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
5511 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
5512 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
5513 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
5514 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
5515 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
5516 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
5517 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
5518 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
5519 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
5520 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
5521 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
5522 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
5523 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
5524 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
5525 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
5526 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
5527 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
5528 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
5529 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
5530 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
5531 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
5532 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
5533 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
5534 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
5535 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
5536 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
5537 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
5538 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
5539 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
5540 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
5541 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
5542 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
5543 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
5544 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
5545 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
5546 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
5547 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
5548 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
5549 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
5550 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
5551 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
5552 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
5553 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
5554 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
5555 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
5556 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
5557 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
5558 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
5559 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
5560 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
5561 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
5562 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
5563 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
5564 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
5565 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
5566 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
5567 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
5568 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
5569 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
5570 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
5571 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
5572 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
5573 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
5574 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
5575 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
5576 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
5577 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
5578 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
5579 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
5580 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
5581 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
5582 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
5583 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
5584 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
5585 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
5586 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
5587 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
5588 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
5589 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
5590 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
5591 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
5592 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
5593 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
5594 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
5595 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
5596 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
5597 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
5598 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
5599 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
5600 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
5601 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
5602 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
5603 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
5604 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
5605 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
5606 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
5607 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
5608 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
5609 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
5610 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
5611 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
5612 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
5613 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
5614 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
5615 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
5616 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
5617 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
5618 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
5619 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
5620 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
5621 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
5622 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
5623 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
5624 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
5625 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
5626 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
5627 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
5628 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
5629 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
5630 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
5631 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
5632 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
5633 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
5634 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
5635 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
5636
5637 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
5638 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5639 return BitCast(d, pairs + Set(du, 0x0100));
5640}
5641
5642 template <typename T, size_t N>
5643 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
5644 const uint64_t mask_bits) {
5645 HWY_DASSERT(mask_bits < 16);
5646
5647 // There are only 4 lanes, so we can afford to load the index vector directly.
5648 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
5649 // PrintCompress32x4Tables
5650 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5651 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5652 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
5653 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5654 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
5655 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
5656 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
5657 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5658 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
5659 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
5660 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
5661 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
5662 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
5663 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
5664 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
5665 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5666 const Simd<T, N, 0> d;
5667 const Repartition<uint8_t, decltype(d)> d8;
5668 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5669}
5670
5671 template <typename T, size_t N>
5672 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
5673 const uint64_t mask_bits) {
5674 HWY_DASSERT(mask_bits < 16);
5675
5676 // There are only 4 lanes, so we can afford to load the index vector directly.
5677 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
5678 // PrintCompressNot32x4Tables
5679 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5680 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5681 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5682 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5683 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5684 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5685 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5686 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5687 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5688 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5689 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5690 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5691 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5692 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5693 12, 13, 14, 15};
5694 const Simd<T, N, 0> d;
5695 const Repartition<uint8_t, decltype(d)> d8;
5696 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5697}
5698
5699#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
5700
5701 template <typename T, size_t N>
5702 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
5703 const uint64_t mask_bits) {
5704 HWY_DASSERT(mask_bits < 4);
5705
5706 // There are only 2 lanes, so we can afford to load the index vector directly.
5707 alignas(16) constexpr uint8_t u8_indices[64] = {
5708 // PrintCompress64x2Tables
5709 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5710 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5711 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5712 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5713
5714 const Simd<T, N, 0> d;
5715 const Repartition<uint8_t, decltype(d)> d8;
5716 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5717}
5718
5719template <typename T, size_t N>
5720HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
5721 const uint64_t mask_bits) {
5722 HWY_DASSERT(mask_bits < 4);
5723
5724 // There are only 2 lanes, so we can afford to load the index vector directly.
5725 alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
5726 // PrintCompressNot64x2Tables
5727 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5728 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5729 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5730 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5731
5732 const Simd<T, N, 0> d;
5733 const Repartition<uint8_t, decltype(d)> d8;
5734 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5735}
5736
5737 #endif // HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
5738
5739// Helper function called by both Compress and CompressStore - avoids a
5740// redundant BitsFromMask in the latter.
5741template <typename T, size_t N>
5742HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
5743 const auto idx =
5744 detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
5745 using D = Simd<T, N, 0>;
5746 const RebindToSigned<D> di;
5747 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
5748}
5749
5750 template <typename T, size_t N>
5751 HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
5752 const auto idx =
5753 detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
5754 using D = Simd<T, N, 0>;
5755 const RebindToSigned<D> di;
5756 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
5757}
5758
5759} // namespace detail
5760
5761// Single lane: no-op
5762 template <typename T>
5763 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
5764 return v;
5765}
5766
5767// Two lanes: conditional swap
5768 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5769 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5770 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
5771 const Simd<T, N, 0> d;
5772 const Vec128<T, N> m = VecFromMask(d, mask);
5773 const Vec128<T, N> maskL = DupEven(m);
5774 const Vec128<T, N> maskH = DupOdd(m);
5775 const Vec128<T, N> swap = AndNot(maskL, maskH);
5776 return IfVecThenElse(swap, Shuffle01(v), v);
5777}
5778
5779// General case
5780template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5781HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
5783}
5784
5785// Single lane: no-op
5786 template <typename T>
5787 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
5788 return v;
5789}
5790
5791// Two lanes: conditional swap
5792 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
5793 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
5794 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
5795 const Full128<T> d;
5796 const Vec128<T> m = VecFromMask(d, mask);
5797 const Vec128<T> maskL = DupEven(m);
5798 const Vec128<T> maskH = DupOdd(m);
5799 const Vec128<T> swap = AndNot(maskH, maskL);
5800 return IfVecThenElse(swap, Shuffle01(v), v);
5801}
5802
5803// General case
5804 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5805 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
5806 // For partial vectors, we cannot pull the Not() into the table because
5807 // BitsFromMask clears the upper bits.
5808 if (N < 16 / sizeof(T)) {
5809 return detail::Compress(v, detail::BitsFromMask(Not(mask)));
5810 }
5811 return detail::CompressNot(v, detail::BitsFromMask(mask));
5812 }
5813
5814// ------------------------------ CompressBlocksNot
5816 Mask128<uint64_t> /* m */) {
5817 return v;
5818}
5819
5820// ------------------------------ CompressBits
5821
5822 template <typename T, size_t N>
5823 HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
5824 const uint8_t* HWY_RESTRICT bits) {
5825 uint64_t mask_bits = 0;
5826 constexpr size_t kNumBytes = (N + 7) / 8;
5827 CopyBytes<kNumBytes>(bits, &mask_bits);
5828 if (N < 8) {
5829 mask_bits &= (1ull << N) - 1;
5830 }
5831
5832 return detail::Compress(v, mask_bits);
5833}
5834
5835// ------------------------------ CompressStore
5836template <typename T, size_t N>
5837 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
5838 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5839 const uint64_t mask_bits = detail::BitsFromMask(mask);
5840 StoreU(detail::Compress(v, mask_bits), d, unaligned);
5841 return PopCount(mask_bits);
5842}
5843
5844// ------------------------------ CompressBlendedStore
5845template <typename T, size_t N>
5846 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
5847 Simd<T, N, 0> d,
5848 T* HWY_RESTRICT unaligned) {
5849 const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
5850 using TU = TFromD<decltype(du)>;
5851 const uint64_t mask_bits = detail::BitsFromMask(m);
5852 const size_t count = PopCount(mask_bits);
5853 const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
5854 const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
5855 BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
5856 return count;
5857}
5858
5859// ------------------------------ CompressBitsStore
5860
5861template <typename T, size_t N>
5862 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
5863 const uint8_t* HWY_RESTRICT bits,
5864 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5865 uint64_t mask_bits = 0;
5866 constexpr size_t kNumBytes = (N + 7) / 8;
5867 CopyBytes<kNumBytes>(bits, &mask_bits);
5868 if (N < 8) {
5869 mask_bits &= (1ull << N) - 1;
5870 }
5871
5872 StoreU(detail::Compress(v, mask_bits), d, unaligned);
5873 return PopCount(mask_bits);
5874}
5875
5876// ------------------------------ LoadInterleaved2
5877
5878// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
5879#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5880#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5881#else
5882#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
5883#endif
5884
5885namespace detail {
5886#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5887#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
5888
5889#if HWY_ARCH_ARM_A64
5890#define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
5891#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
5892#else
5893// Exclude 64x2 and f64x1, which are only supported on aarch64
5894#define HWY_IF_LOAD_INT(T, N) \
5895 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
5896#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
5897 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
5898 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
5899 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
5900 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
5901 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
5902#endif // HWY_ARCH_ARM_A64
5903
5904 // Must return raw tuple because Tuple2 lacks a ctor, and we cannot use
5905// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
5906// void.
5907#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5908 decltype(Tuple2<type##_t, size>().raw)
5909// Tuple tag arg allows overloading (cannot just overload on return type)
5910#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5911 const type##_t *from, Tuple2<type##_t, size>
5912 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
5913 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5914#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5915
5916#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5917 decltype(Tuple3<type##_t, size>().raw)
5918#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5919 const type##_t *from, Tuple3<type##_t, size>
5920 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
5921 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5922#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5923
5924#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5925 decltype(Tuple4<type##_t, size>().raw)
5926#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5927 const type##_t *from, Tuple4<type##_t, size>
5928 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
5929 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5930#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5931
5932#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
5933#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5934#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
5935} // namespace detail
5936
5937template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
5938 HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
5939 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5940 Vec128<T, N>& v1) {
5941 auto raw = detail::LoadInterleaved2(unaligned, detail::Tuple2<T, N>());
5942 v0 = Vec128<T, N>(raw.val[0]);
5943 v1 = Vec128<T, N>(raw.val[1]);
5944}
5945
5946// <= 32 bits: avoid loading more than N bytes by copying to buffer
5947template <typename T, size_t N, HWY_IF_LE32(T, N)>
5948HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
5949 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5950 Vec128<T, N>& v1) {
5951 // The smallest vector registers are 64 bits and we want space for two.
5952 alignas(16) T buf[2 * 8 / sizeof(T)] = {};
5953 CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
5954 auto raw = detail::LoadInterleaved2(buf, detail::Tuple2<T, N>());
5955 v0 = Vec128<T, N>(raw.val[0]);
5956 v1 = Vec128<T, N>(raw.val[1]);
5957}
5958
5959#if HWY_ARCH_ARM_V7
5960// 64x2: split into two 64x1
5961template <typename T, HWY_IF_LANE_SIZE(T, 8)>
5962 HWY_API void LoadInterleaved2(Full128<T> d, const T* HWY_RESTRICT unaligned,
5963 Vec128<T>& v0, Vec128<T>& v1) {
5964 const Half<decltype(d)> dh;
5965 VFromD<decltype(dh)> v00, v10, v01, v11;
5966 LoadInterleaved2(dh, unaligned, v00, v10);
5967 LoadInterleaved2(dh, unaligned + 2, v01, v11);
5968 v0 = Combine(d, v01, v00);
5969 v1 = Combine(d, v11, v10);
5970}
5971#endif // HWY_ARCH_ARM_V7
5972
5973// ------------------------------ LoadInterleaved3
5974
5975template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
5976 HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
5977 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5978 Vec128<T, N>& v1, Vec128<T, N>& v2) {
5979 auto raw = detail::LoadInterleaved3(unaligned, detail::Tuple3<T, N>());
5980 v0 = Vec128<T, N>(raw.val[0]);
5981 v1 = Vec128<T, N>(raw.val[1]);
5982 v2 = Vec128<T, N>(raw.val[2]);
5983}
5984
5985// <= 32 bits: avoid writing more than N bytes by copying to buffer
5986template <typename T, size_t N, HWY_IF_LE32(T, N)>
5987HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
5988 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
5989 Vec128<T, N>& v1, Vec128<T, N>& v2) {
5990 // The smallest vector registers are 64 bits and we want space for three.
5991 alignas(16) T buf[3 * 8 / sizeof(T)] = {};
5992 CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
5993 auto raw = detail::LoadInterleaved3(buf, detail::Tuple3<T, N>());
5994 v0 = Vec128<T, N>(raw.val[0]);
5995 v1 = Vec128<T, N>(raw.val[1]);
5996 v2 = Vec128<T, N>(raw.val[2]);
5997}
5998
5999#if HWY_ARCH_ARM_V7
6000// 64x2: split into two 64x1
6001template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6002HWY_API void LoadInterleaved3(Full128<T> d, const T* HWY_RESTRICT unaligned,
6003 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
6004 const Half<decltype(d)> dh;
6005 VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
6006 LoadInterleaved3(dh, unaligned, v00, v10, v20);
6007 LoadInterleaved3(dh, unaligned + 3, v01, v11, v21);
6008 v0 = Combine(d, v01, v00);
6009 v1 = Combine(d, v11, v10);
6010 v2 = Combine(d, v21, v20);
6011}
6012#endif // HWY_ARCH_ARM_V7
6013
6014// ------------------------------ LoadInterleaved4
6015
6016template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
6017 HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
6018 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6019 Vec128<T, N>& v1, Vec128<T, N>& v2,
6020 Vec128<T, N>& v3) {
6021 auto raw = detail::LoadInterleaved4(unaligned, detail::Tuple4<T, N>());
6022 v0 = Vec128<T, N>(raw.val[0]);
6023 v1 = Vec128<T, N>(raw.val[1]);
6024 v2 = Vec128<T, N>(raw.val[2]);
6025 v3 = Vec128<T, N>(raw.val[3]);
6026}
6027
6028// <= 32 bits: avoid writing more than N bytes by copying to buffer
6029template <typename T, size_t N, HWY_IF_LE32(T, N)>
6030HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
6031 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6032 Vec128<T, N>& v1, Vec128<T, N>& v2,
6033 Vec128<T, N>& v3) {
6034 alignas(16) T buf[4 * 8 / sizeof(T)] = {};
6035 CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
6036 auto raw = detail::LoadInterleaved4(buf, detail::Tuple4<T, N>());
6037 v0 = Vec128<T, N>(raw.val[0]);
6038 v1 = Vec128<T, N>(raw.val[1]);
6039 v2 = Vec128<T, N>(raw.val[2]);
6040 v3 = Vec128<T, N>(raw.val[3]);
6041}
6042
6043#if HWY_ARCH_ARM_V7
6044// 64x2: split into two 64x1
6045template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6046HWY_API void LoadInterleaved4(Full128<T> d, const T* HWY_RESTRICT unaligned,
6047 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
6048 Vec128<T>& v3) {
6049 const Half<decltype(d)> dh;
6050 VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
6051 LoadInterleaved4(dh, unaligned, v00, v10, v20, v30);
6052 LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31);
6053 v0 = Combine(d, v01, v00);
6054 v1 = Combine(d, v11, v10);
6055 v2 = Combine(d, v21, v20);
6056 v3 = Combine(d, v31, v30);
6057}
6058#endif // HWY_ARCH_ARM_V7
6059
6060#undef HWY_IF_LOAD_INT
6061
6062// ------------------------------ StoreInterleaved2
6063
6064namespace detail {
6065#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
6066#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
6067#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
6068
6069#if HWY_ARCH_ARM_A64
6070#define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
6071#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6072#else
6073// Exclude 64x2 and f64x1, which are only supported on aarch64
6074#define HWY_IF_STORE_INT(T, N) \
6075 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6076#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
6077 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6078 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6079 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6080 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6081 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6082#endif // HWY_ARCH_ARM_A64
6083
6084#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6085 Tuple2<type##_t, size> tup, type##_t *to
6086 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT)
6087 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6088
6089#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6090 Tuple3<type##_t, size> tup, type##_t *to
6091 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT)
6092 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6093
6094#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6095 Tuple4<type##_t, size> tup, type##_t *to
6096 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT)
6097 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6098
6099#undef HWY_NEON_DEF_FUNCTION_STORE_INT
6100#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
6101#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
6102#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
6103} // namespace detail
6104
6105template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6106 HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
6107 Simd<T, N, 0> /*tag*/,
6108 T* HWY_RESTRICT unaligned) {
6109 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6110 detail::StoreInterleaved2(tup, unaligned);
6111}
6112
6113// <= 32 bits: avoid writing more than N bytes by copying to buffer
6114template <typename T, size_t N, HWY_IF_LE32(T, N)>
6115HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
6116 Simd<T, N, 0> /*tag*/,
6117 T* HWY_RESTRICT unaligned) {
6118 alignas(16) T buf[2 * 8 / sizeof(T)];
6119 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6120 detail::StoreInterleaved2(tup, buf);
6121 CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
6122}
6123
6124#if HWY_ARCH_ARM_V7
6125// 64x2: split into two 64x1
6126template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6127HWY_API void StoreInterleaved2(const Vec128<T> v0, const Vec128<T> v1,
6128 Full128<T> d, T* HWY_RESTRICT unaligned) {
6129 const Half<decltype(d)> dh;
6130 StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned);
6131 StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2);
6132}
6133#endif // HWY_ARCH_ARM_V7
6134
6135// ------------------------------ StoreInterleaved3
6136
6137template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6138 HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
6139 const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
6140 T* HWY_RESTRICT unaligned) {
6141 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6142 detail::StoreInterleaved3(tup, unaligned);
6143}
6144
6145// <= 32 bits: avoid writing more than N bytes by copying to buffer
6146template <typename T, size_t N, HWY_IF_LE32(T, N)>
6147HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
6148 const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
6149 T* HWY_RESTRICT unaligned) {
6150 alignas(16) T buf[3 * 8 / sizeof(T)];
6151 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6152 detail::StoreInterleaved3(tup, buf);
6153 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
6154}
6155
6156#if HWY_ARCH_ARM_V7
6157// 64x2: split into two 64x1
6158template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6159HWY_API void StoreInterleaved3(const Vec128<T> v0, const Vec128<T> v1,
6160 const Vec128<T> v2, Full128<T> d,
6161 T* HWY_RESTRICT unaligned) {
6162 const Half<decltype(d)> dh;
6163 StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
6164 unaligned);
6165 StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
6166 unaligned + 3);
6167}
6168#endif // HWY_ARCH_ARM_V7
6169
6170// ------------------------------ StoreInterleaved4
6171
6172template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6173 HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
6174 const Vec128<T, N> v2, const Vec128<T, N> v3,
6175 Simd<T, N, 0> /*tag*/,
6176 T* HWY_RESTRICT unaligned) {
6177 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6178 detail::StoreInterleaved4(tup, unaligned);
6179}
6180
6181// <= 32 bits: avoid writing more than N bytes by copying to buffer
6182template <typename T, size_t N, HWY_IF_LE32(T, N)>
6183HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
6184 const Vec128<T, N> v2, const Vec128<T, N> v3,
6185 Simd<T, N, 0> /*tag*/,
6186 T* HWY_RESTRICT unaligned) {
6187 alignas(16) T buf[4 * 8 / sizeof(T)];
6188 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6189 detail::StoreInterleaved4(tup, buf);
6190 CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
6191}
6192
6193#if HWY_ARCH_ARM_V7
6194// 64x2: split into two 64x1
6195template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6196HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
6197 const Vec128<T> v2, const Vec128<T> v3,
6198 Full128<T> d, T* HWY_RESTRICT unaligned) {
6199 const Half<decltype(d)> dh;
6200 StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
6201 LowerHalf(dh, v3), dh, unaligned);
6202 StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
6203 UpperHalf(dh, v3), dh, unaligned + 4);
6204}
6205#endif // HWY_ARCH_ARM_V7
6206
6207#undef HWY_IF_STORE_INT
6208
6209// ------------------------------ Lt128
6210
6211template <typename T, size_t N, HWY_IF_LE128(T, N)>
6212 HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
6213 Vec128<T, N> b) {
6214 static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
6215 // Truth table of Eq and Lt for Hi and Lo u64.
6216 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
6217 // =H =L cH cL | out = cH | (=H & cL)
6218 // 0 0 0 0 | 0
6219 // 0 0 0 1 | 0
6220 // 0 0 1 0 | 1
6221 // 0 0 1 1 | 1
6222 // 0 1 0 0 | 0
6223 // 0 1 0 1 | 0
6224 // 0 1 1 0 | 1
6225 // 1 0 0 0 | 0
6226 // 1 0 0 1 | 1
6227 // 1 1 0 0 | 0
6228 const Mask128<T, N> eqHL = Eq(a, b);
6229 const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
6230 // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
6231 // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
6232 // comparison result leftwards requires only 4. IfThenElse compiles to the
6233 // same code as OrAnd().
6234 const Vec128<T, N> ltLx = DupEven(ltHL);
6235 const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
6236 return MaskFromVec(DupOdd(outHx));
6237}
6238
6239template <typename T, size_t N, HWY_IF_LE128(T, N)>
6240 HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
6241 Vec128<T, N> b) {
6242 const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
6243 return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
6244}
6245
6246// ------------------------------ Min128, Max128 (Lt128)
6247
6248// Without a native OddEven, it seems infeasible to go faster than Lt128.
6249template <class D>
6250 HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
6251 return IfThenElse(Lt128(d, a, b), a, b);
6252}
6253
6254template <class D>
6255 HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
6256 return IfThenElse(Lt128(d, b, a), a, b);
6257}
6258
6259template <class D>
6260 HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
6261 return IfThenElse(Lt128Upper(d, a, b), a, b);
6262}
6263
6264template <class D>
6265 HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
6266 return IfThenElse(Lt128Upper(d, b, a), a, b);
6267}
6268
6269// ================================================== Operator wrapper
6270
6271 // These apply to all per-target *-inl.h because there are no restrictions on V.
6272
6273template <class V>
6274HWY_API V Add(V a, V b) {
6275 return a + b;
6276}
6277template <class V>
6278HWY_API V Sub(V a, V b) {
6279 return a - b;
6280}
6281
6282template <class V>
6283HWY_API V Mul(V a, V b) {
6284 return a * b;
6285}
6286template <class V>
6287HWY_API V Div(V a, V b) {
6288 return a / b;
6289}
6290
6291template <class V>
6292V Shl(V a, V b) {
6293 return a << b;
6294}
6295template <class V>
6296V Shr(V a, V b) {
6297 return a >> b;
6298}
6299
6300template <class V>
6301HWY_API auto Eq(V a, V b) -> decltype(a == b) {
6302 return a == b;
6303}
6304template <class V>
6305HWY_API auto Ne(V a, V b) -> decltype(a == b) {
6306 return a != b;
6307}
6308template <class V>
6309HWY_API auto Lt(V a, V b) -> decltype(a == b) {
6310 return a < b;
6311}
6312
6313template <class V>
6314HWY_API auto Gt(V a, V b) -> decltype(a == b) {
6315 return a > b;
6316}
6317template <class V>
6318HWY_API auto Ge(V a, V b) -> decltype(a == b) {
6319 return a >= b;
6320}
6321
6322template <class V>
6323HWY_API auto Le(V a, V b) -> decltype(a == b) {
6324 return a <= b;
6325}
6326
6327namespace detail { // for code folding
6328#if HWY_ARCH_ARM_V7
6329#undef vuzp1_s8
6330#undef vuzp1_u8
6331#undef vuzp1_s16
6332#undef vuzp1_u16
6333#undef vuzp1_s32
6334#undef vuzp1_u32
6335#undef vuzp1_f32
6336#undef vuzp1q_s8
6337#undef vuzp1q_u8
6338#undef vuzp1q_s16
6339#undef vuzp1q_u16
6340#undef vuzp1q_s32
6341#undef vuzp1q_u32
6342#undef vuzp1q_f32
6343#undef vuzp2_s8
6344#undef vuzp2_u8
6345#undef vuzp2_s16
6346#undef vuzp2_u16
6347#undef vuzp2_s32
6348#undef vuzp2_u32
6349#undef vuzp2_f32
6350#undef vuzp2q_s8
6351#undef vuzp2q_u8
6352#undef vuzp2q_s16
6353#undef vuzp2q_u16
6354#undef vuzp2q_s32
6355#undef vuzp2q_u32
6356#undef vuzp2q_f32
6357#undef vzip1_s8
6358#undef vzip1_u8
6359#undef vzip1_s16
6360#undef vzip1_u16
6361#undef vzip1_s32
6362#undef vzip1_u32
6363#undef vzip1_f32
6364#undef vzip1q_s8
6365#undef vzip1q_u8
6366#undef vzip1q_s16
6367#undef vzip1q_u16
6368#undef vzip1q_s32
6369#undef vzip1q_u32
6370#undef vzip1q_f32
6371#undef vzip2_s8
6372#undef vzip2_u8
6373#undef vzip2_s16
6374#undef vzip2_u16
6375#undef vzip2_s32
6376#undef vzip2_u32
6377#undef vzip2_f32
6378#undef vzip2q_s8
6379#undef vzip2q_u8
6380#undef vzip2q_s16
6381#undef vzip2q_u16
6382#undef vzip2q_s32
6383#undef vzip2q_u32
6384#undef vzip2q_f32
6385 #endif // HWY_ARCH_ARM_V7
6386
6387#undef HWY_NEON_BUILD_ARG_1
6388#undef HWY_NEON_BUILD_ARG_2
6389#undef HWY_NEON_BUILD_ARG_3
6390#undef HWY_NEON_BUILD_PARAM_1
6391#undef HWY_NEON_BUILD_PARAM_2
6392#undef HWY_NEON_BUILD_PARAM_3
6393#undef HWY_NEON_BUILD_RET_1
6394#undef HWY_NEON_BUILD_RET_2
6395#undef HWY_NEON_BUILD_RET_3
6396#undef HWY_NEON_BUILD_TPL_1
6397#undef HWY_NEON_BUILD_TPL_2
6398#undef HWY_NEON_BUILD_TPL_3
6399#undef HWY_NEON_DEF_FUNCTION
6400#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
6401#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
6402#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
6403#undef HWY_NEON_DEF_FUNCTION_INTS
6404#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
6405#undef HWY_NEON_DEF_FUNCTION_INT_16
6406#undef HWY_NEON_DEF_FUNCTION_INT_32
6407#undef HWY_NEON_DEF_FUNCTION_INT_8
6408#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
6409#undef HWY_NEON_DEF_FUNCTION_TPL
6410#undef HWY_NEON_DEF_FUNCTION_UIF81632
6411#undef HWY_NEON_DEF_FUNCTION_UINTS
6412#undef HWY_NEON_DEF_FUNCTION_UINT_16
6413#undef HWY_NEON_DEF_FUNCTION_UINT_32
6414#undef HWY_NEON_DEF_FUNCTION_UINT_8
6415#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
6416#undef HWY_NEON_EVAL
6417} // namespace detail
6418
6419// NOLINTNEXTLINE(google-readability-namespace-comments)
6420} // namespace HWY_NAMESPACE
6421} // namespace hwy
HWY_AFTER_NAMESPACE()
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition: arm_neon-inl.h:159
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:182
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition: arm_neon-inl.h:192
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:138
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:133
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:6076
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:91
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:187
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:121
HWY_BEFORE_NAMESPACE()
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:165
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)
Definition: arm_neon-inl.h:2385
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:107
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:114
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)
Definition: arm_neon-inl.h:196
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:177
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:99
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:127
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:5896
#define HWY_IF_FLOAT(T)
Definition: base.h:343
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
HWY_INLINE Mask128()
Definition: arm_neon-inl.h:809
Mask128(const Mask128 &)=default
Mask128 & operator=(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition: arm_neon-inl.h:812
Raw raw
Definition: arm_neon-inl.h:814
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:806
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128()
Definition: arm_neon-inl.h:764
HWY_INLINE Vec128(const Raw raw)
Definition: arm_neon-inl.h:767
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: arm_neon-inl.h:774
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: arm_neon-inl.h:780
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: arm_neon-inl.h:789
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: arm_neon-inl.h:786
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: arm_neon-inl.h:771
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: arm_neon-inl.h:783
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: arm_neon-inl.h:777
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:721
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition: arm_neon-inl.h:1733
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2510
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition: arm_neon-inl.h:5325
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:3219
HWY_INLINE Vec128< T, N > Set64(Simd< T, N, 0 >, uint64_t mask_bits)
Definition: arm_neon-inl.h:4952
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:3208
HWY_INLINE uint64_t NibblesFromMask(const Full128< T > d, Mask128< T > mask)
Definition: arm_neon-inl.h:5019
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition: x86_256-inl.h:4283
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) template< size_t N > HWY_INLINE Vec128< uint8_t
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:855
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3345
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:2039
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1574
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2502
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition: arm_neon-inl.h:1884
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition: arm_neon-inl.h:1388
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4164
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, 2 > ConcatEven(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4474
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4189
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, 2 > ConcatOdd(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4443
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
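The interleaved loads and stores (LoadInterleaved3 above maps to NEON vld3, StoreInterleaved3 to vst3) are the usual way to process packed RGB. A hedged sketch (HalveGreen is hypothetical; it handles whole vectors only, so remainder pixels need a tail loop):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void HalveGreen(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT out,
                size_t num_pixels) {
  const hn::Full128<uint8_t> d;  // 16 pixels per iteration
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    hn::Vec128<uint8_t> r, g, b;
    hn::LoadInterleaved3(d, in + 3 * i, r, g, b);    // deinterleave RGBRGB...
    g = hn::ShiftRight<1>(g);                        // halve the green channel
    hn::StoreInterleaved3(r, g, b, d, out + 3 * i);  // re-interleave
  }
}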
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
hwy
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
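Mul128 is the portable scalar 64x64 -> 128-bit multiply: it returns the low half of the product and stores the high half through upper. For example (FullProduct is a hypothetical wrapper):

#include <stdint.h>
#include "hwy/base.h"

// Splits the 128-bit product of a and b into two 64-bit halves.
void FullProduct(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  *lo = hwy::Mul128(a, b, hi);
}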
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
constexpr float MantissaEnd< float >()
Definition: base.h:636
double float64_t
Definition: base.h:258
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
float float32_t
Definition: base.h:257
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
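PopCount and Num0BitsBelowLS1Bit_Nonzero64 (above) are the scalar helpers typically paired with lane masks, e.g. to visit every set bit. A sketch (ForEachSetBit is hypothetical; the _Nonzero64 suffix means the argument must not be zero):

#include <stdint.h>
#include "hwy/base.h"

// Calls func(i) for the index i of every set bit in mask, lowest first.
// hwy::PopCount(mask) equals the number of calls made below.
template <class Func>
void ForEachSetBit(uint64_t mask, const Func& func) {
  while (mask != 0) {
    func(hwy::Num0BitsBelowLS1Bit_Nonzero64(mask));  // index of lowest 1-bit
    mask &= mask - 1;                                // clear that bit
  }
}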
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
(Remaining hover-tooltip entries omitted: they list only the bare "type"/"raw" members of internal specializations — the detail::Raw128<T, N> lane types at arm_neon-inl.h:580-693, the Tuple2/Tuple3/Tuple4 members at arm_neon-inl.h:265-598, the shift-helper functors at arm_neon-inl.h:3539-3598, and counterparts in x86_128-inl.h, wasm_128-inl.h, ops/shared-inl.h, and base.h — whose enclosing struct names were lost in extraction.)