// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// RISC-V V vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// Enables the overload if Pow2 is in [min, max].
#define HWY_RVV_IF_POW2_IN(D, min, max) \
  hwy::EnableIf<(min) <= Pow2(D()) && Pow2(D()) <= (max)>* = nullptr

template <typename T, size_t N, int kPow2>
constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
  // argument enables fractional LMUL < 1. Limit to 64 because that is the
  // largest value for which vbool##_t are defined.
  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
}
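
// Worked example (illustrative): for T=uint16_t and kPow2=1 (LMUL=2),
// sizeof(T)*8*8 = 128 and detail::ScaleByPower(8, 1) = 16, so this returns
// 128 / 16 = 8, i.e. vbool8_t -- matching MLEN = SEW/LMUL = 16/2.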

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
// The first two arguments are SEW and SHIFT such that SEW >> SHIFT = MLEN.
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
  X_MACRO(64, 0, 64, NAME, OP) \
  X_MACRO(32, 0, 32, NAME, OP) \
  X_MACRO(16, 0, 16, NAME, OP) \
  X_MACRO(8, 0, 8, NAME, OP) \
  X_MACRO(8, 1, 4, NAME, OP) \
  X_MACRO(8, 2, 2, NAME, OP) \
  X_MACRO(8, 3, 1, NAME, OP)
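// For instance, the row X_MACRO(8, 2, 2, ...) covers vbool2_t: SEW=8 with
// SHIFT=2 gives MLEN = 8 >> 2 = 2.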

// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
//
// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP
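//
// For example, HWY_RVV_FOREACH_U16(X, NAME, OP, _ALL) expands (via HWY_CONCAT)
// to HWY_RVV_FOREACH_16_ALL(X, uint, u, NAME, OP), which invokes X once per
// uint16_t LMUL in {mf4, mf2, m1, m2, m4, m8}.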

// LMULS = _TRUNC: truncatable (not the smallest LMUL)
#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _LE2: <= 2
#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)

// LMULS = _EXT: not the largest LMUL
#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)

// LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
// one less than should be supported, with all other parameters (vector type
// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
// returns half of what it usually would.
//
// Notice that we can only add overloads whenever there is a D argument: those
// are unique with respect to non-virtual-LMUL overloads because their kPow2
// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
// defining another overload with the same LMUL would be an error. Thus we have
// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
// functions that take a D.
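//
// Example: for Simd<uint64_t, N, -1>, the vector type is still vuint64m1_t
// (there is no vuint64mf2_t), but Lanes() reports half the m1 lane count.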

#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)

// ALL + VIRT
#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// LE2 + VIRT
#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// EXT + VIRT
#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// DEMOTE + VIRT
#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// SEW for unsigned:
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)

// SEW for signed:
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)

// SEW for float:
#if HWY_HAVE_FLOAT16
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
#else
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
#endif
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)

// Commonly used type/SEW groups:
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)

// For all combinations of SEW:
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)

// Commonly used type categories:
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)

// Assemble types for use in x-macros
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t

}  // namespace detail

// Until we have full intrinsic support for fractional LMUL, mixed-precision
// code can use LMUL 1..8 (adequate unless they need many registers).
#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <> \
  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
    using Lane = HWY_RVV_T(BASE, SEW); \
    using type = ScalableTag<Lane, SHIFT>; \
  };

HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL_VIRT)
#undef HWY_SPECIALIZE

// ------------------------------ Lanes

// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
// vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    size_t actual = v##OP##SEW##LMUL(); \
    /* Common case of full vectors: avoid any extra instructions. */ \
    /* actual includes LMUL, so do not shift again. */ \
    if (detail::IsFull(d)) return actual; \
    /* Check for virtual LMUL, e.g. "uint16mf8_t" (not provided by */ \
    /* intrinsics). In this case the actual LMUL is 1/4, so divide by */ \
    /* another factor of two. */ \
    if (detail::ScaleByPower(128 / SEW, SHIFT) == 1) actual >>= 1; \
    return HWY_MIN(actual, N); \
  }

HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL_VIRT)
#undef HWY_RVV_LANES

template <size_t N, int kPow2>
HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag */) {
  return Lanes(Simd<uint16_t, N, kPow2>());
}

// ------------------------------ Common x-macros

// Last argument to most intrinsics. Use when the op has no d arg of its own,
// which means there is no user-specified cap.
#define HWY_RVV_AVL(SEW, SHIFT) \
  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())
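
// For example, HWY_RVV_AVL(32, 1) expands to
// Lanes(ScalableTag<uint32_t, 1>()), the full u32 lane count at LMUL=2.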

// vector = f(vector), e.g. Not
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, scalar), e.g. detail::AddS
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(mask)
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
    return vm##OP##_m_b##MLEN(m, ~0ull); \
  }

// ================================================== INIT

// ------------------------------ Set

#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
    return v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
#undef HWY_RVV_SET

// Treat bfloat16_t as uint16_t (using the previously defined Set overloads);
// required for Zero and VFromD.
template <size_t N, int kPow2>
decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
                                                 bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));
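
// For example, VFromD<ScalableTag<int32_t>> is vint32m1_t (kPow2 = 0 => m1).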

// ------------------------------ Zero

template <typename T, size_t N, int kPow2>
HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
  return Set(d, T(0));
}

// ------------------------------ Undefined

// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
// by it gives unpredictable results. It should only be used for maskoff, so
// keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \
    return v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \
  }

HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL_VIRT)
#undef HWY_RVV_UNDEFINED
}  // namespace detail

template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ BitCast

namespace detail {

// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
#undef HWY_RVV_TRUNC

// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
           HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
#undef HWY_RVV_EXT

// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
// the same as the actual input type.
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
           HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v; \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
#undef HWY_RVV_EXT_VIRT

// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
// by _ALL_VIRT.

// There is no reinterpret from u8 <-> u8, so just return.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vuint8##LMUL##_t v) { \
    return v; \
  } \
  template <size_t N> \
  HWY_API vuint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v; \
  }

// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vint8##LMUL##_t v) { \
    return vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API vint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
  }

// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
  }

// Additional versions for virtual LMUL using LMULH for byte vectors.
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
  }

HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)

#undef HWY_RVV_CAST_U8
#undef HWY_RVV_CAST_I8
#undef HWY_RVV_CAST_U
#undef HWY_RVV_CAST_IF
#undef HWY_RVV_CAST_VIRT_U
#undef HWY_RVV_CAST_VIRT_IF

template <size_t N, int kPow2>
HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
    Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
}

namespace detail {

template <class V, class DU = RebindToUnsigned<DFromV<V>>>
HWY_INLINE VFromD<DU> BitCastToUnsigned(V v) {
  return BitCast(DU(), v);
}

}  // namespace detail

// ------------------------------ Iota

namespace detail {

#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    return v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
#undef HWY_RVV_IOTA

template <class D, class DU = RebindToUnsigned<D>>
HWY_INLINE VFromD<DU> Iota0(const D /*d*/) {
  return BitCastToUnsigned(Iota0(DU()));
}

}  // namespace detail

// ================================================== LOGICAL

// ------------------------------ Not

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Not(const V v) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Not(BitCast(DU(), v)));
}

// ------------------------------ And

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Or

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Xor

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ AndNot

template <class V>
HWY_API V AndNot(const V not_a, const V b) {
  return And(Not(not_a), b);
}

// ------------------------------ Or3

template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ CopySign

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  // RVV can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}

// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)

// ------------------------------ Sub
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)

// ------------------------------ SaturatedAdd

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)

// ------------------------------ SaturatedSub

HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)

HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)

// ------------------------------ AverageRound

// TODO(janwas): check vxrm rounding mode
HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)

// ------------------------------ ShiftLeft[Same]

// Intrinsics do not define .vi forms, so use .vx instead.
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, HWY_RVV_AVL(SEW, SHIFT)); \
  } \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
    return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
                                        HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)

// ------------------------------ ShiftRight[Same]

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)

#undef HWY_RVV_SHIFT

// ------------------------------ SumsOf8 (ShiftRight, Add)
template <class VU8>
HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
  const DFromV<VU8> du8;
  const RepartitionToWide<decltype(du8)> du16;
  const RepartitionToWide<decltype(du16)> du32;
  const RepartitionToWide<decltype(du32)> du64;
  using VU16 = VFromD<decltype(du16)>;

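  // In the names below, each hex digit is the index of a source byte;
  // 'z' marks positions known to be zero, 'x' don't-care positions.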
  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
}

// ------------------------------ RotateRight
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
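
// For example, RotateRight<8>(v) on u32 lanes holding 0x12345678 yields
// 0x78123456: (0x12345678 >> 8) | (0x12345678 << 24).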

// ------------------------------ Shl
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, bits, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)

#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits), \
                                        HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)

// ------------------------------ Shr

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)

#undef HWY_RVV_SHIFT_II
#undef HWY_RVV_SHIFT_VV

// ------------------------------ Min

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)

// ------------------------------ Max

namespace detail {

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)

}  // namespace detail

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)

// ------------------------------ Mul

// Only for internal use (Highway only promises Mul for 16/32-bit inputs).
// Used by MulLower.
namespace detail {
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI16(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_UI32(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)

// ------------------------------ MulHigh

// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
// Used by MulEven; vwmul does not work for m8.
namespace detail {
HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)

// ------------------------------ MulFixedPoint15
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulFixedPoint15, smul, _ALL)

// ------------------------------ Div
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)

// ------------------------------ ApproximateReciprocal
HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)

// ------------------------------ Sqrt
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)

// ------------------------------ ApproximateReciprocalSqrt
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)

// ------------------------------ MulAdd
// Note: op is still named vv, not vvv.
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
           HWY_RVV_V(BASE, SEW, LMUL) add) { \
    return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)

// ------------------------------ NegMulAdd
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)

// ------------------------------ MulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)

// ------------------------------ NegMulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)

#undef HWY_RVV_FMA

// ================================================== COMPARE

// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
// vboolXX_t is a power of two divisor for vector bits. SEW 8 / LMUL 1 = 1/8th
// of all bits; SEW 8 / LMUL 4 = half of all bits.

// mask = f(vector, vector)
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b, \
                                                  HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(vector, scalar)
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return v##OP##_##CHAR##SEW##LMUL##_b##MLEN(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// ------------------------------ Eq
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
}  // namespace detail

// ------------------------------ Ne
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
}  // namespace detail

// ------------------------------ Lt
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)

namespace detail {
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
}  // namespace detail

// ------------------------------ Le
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)

#undef HWY_RVV_RETM_ARGVV
#undef HWY_RVV_RETM_ARGVS

// ------------------------------ Gt/Ge

template <class V>
HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);
}

template <class V>
HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
  return Lt(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
  return detail::NeS(And(a, bit), 0);
}
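
// For example, TestBit(Set(d, 6), Set(d, 2)) is all-true because 6 & 2 = 2,
// which detail::NeS reports as != 0.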

// ------------------------------ Not
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not)

// ------------------------------ And

// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
    return vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)

// ------------------------------ AndNot
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn)

// ------------------------------ Or
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)

// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)

#undef HWY_RVV_RETM_ARGMM

// ------------------------------ IfThenElse
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
           HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)

#undef HWY_RVV_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class M, class V>
HWY_API V IfThenElseZero(const M mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse

#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                  LMULH, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return v##OP##_##CHAR##SEW##LMUL(m, no, 0, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)

#undef HWY_RVV_IF_THEN_ZERO_ELSE

// ------------------------------ MaskFromVec

template <class V>
HWY_API auto MaskFromVec(const V v) -> decltype(Eq(v, v)) {
  return detail::NeS(v, 0);
}

template <class D>
using MFromD = decltype(MaskFromVec(Zero(D())));

template <class D, typename MFrom>
HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
  // No need to check lane size/LMUL are the same: if not, casting MFrom to
  // MFromD<D> would fail.
  return mask;
}

// ------------------------------ VecFromMask

namespace detail {
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_M(MLEN) m) { \
    return v##OP##_##CHAR##SEW##LMUL##_m(m, v0, v0, 1, \
                                         HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, SubS, sub_vx, _ALL)
#undef HWY_RVV_VEC_FROM_MASK
}  // namespace detail

template <class D, HWY_IF_NOT_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return detail::SubS(Zero(d), mask);
}

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
}

// ------------------------------ IfVecThenElse (MaskFromVec)

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ ZeroIfNegative
template <class V>
HWY_API V ZeroIfNegative(const V v) {
  return IfThenZeroElse(detail::LtS(v, 0), v);
}

// ------------------------------ BroadcastSignBit
template <class V>
HWY_API V BroadcastSignBit(const V v) {
  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
}
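
// For example, for int32_t lanes this is ShiftRight<31>, yielding 0 for
// non-negative inputs and -1 (all bits set) for negative inputs.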

// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
template <class V>
HWY_API V IfNegativeThenElse(V v, V yes, V no) {
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;

  MFromD<decltype(d)> m =
      MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
  return IfThenElse(m, yes, no);
}

// ------------------------------ FindFirstTrue

#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return vfirst_m_b##MLEN(m, Lanes(d)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
#undef HWY_RVV_FIND_FIRST_TRUE

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, MFromD<D> m) {
  return FindFirstTrue(d, m) < 0;
}

// ------------------------------ AllTrue

#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return AllFalse(d, vmnot_m_b##MLEN(m, Lanes(d))); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
#undef HWY_RVV_ALL_TRUE

// ------------------------------ CountTrue

#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return vcpop_m_b##MLEN(m, Lanes(d)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
#undef HWY_RVV_COUNT_TRUE

// ================================================== MEMORY

// ------------------------------ Load

#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
#undef HWY_RVV_LOAD

// There is no native BF16, treat as uint16_t.
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(
    Simd<bfloat16_t, N, kPow2> d, const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
                   Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ LoadU

// RVV only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ MaskedLoad

#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, Zero(d), p, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
#undef HWY_RVV_MASKED_LOAD

// ------------------------------ Store

#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
#undef HWY_RVV_STORE

// ------------------------------ BlendedStore

#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
#undef HWY_RVV_BLENDED_STORE

namespace detail {

#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count); \
  }
HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
#undef HWY_RVV_STOREN

}  // namespace detail

// ------------------------------ StoreU

// RVV only requires lane alignment, not natural alignment of the entire
// vector.
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ------------------------------ Stream
template <class V, class D, typename T>
HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ ScatterOffset

#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
                    HWY_RVV_V(int, SEW, LMUL) offset) { \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        base, detail::BitCastToUnsigned(offset), v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
#undef HWY_RVV_SCATTER

// ------------------------------ ScatterIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          const VFromD<RebindToSigned<D>> index) {
  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
}

// ------------------------------ GatherOffset

#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
           HWY_RVV_V(int, SEW, LMUL) offset) { \
    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        base, detail::BitCastToUnsigned(offset), Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
#undef HWY_RVV_GATHER

// ------------------------------ GatherIndex

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<2>(index));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  return GatherOffset(d, base, ShiftLeft<3>(index));
}

// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, unaligned, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT)
#undef HWY_RVV_LOAD2

// ------------------------------ LoadInterleaved3

#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1, \
                    HWY_RVV_V(BASE, SEW, LMUL) & v2) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, unaligned, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT)
#undef HWY_RVV_LOAD3

// ------------------------------ LoadInterleaved4

#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned, \
      HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \
      HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, &v3, aligned, \
                                        Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT)
#undef HWY_RVV_LOAD4

// ------------------------------ StoreInterleaved2

#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \
                    HWY_RVV_V(BASE, SEW, LMUL) v1, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT)
#undef HWY_RVV_STORE2

// ------------------------------ StoreInterleaved3

#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME( \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, v2, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT)
#undef HWY_RVV_STORE3

// ------------------------------ StoreInterleaved4

#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME( \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \
    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(aligned, v0, v1, v2, v3, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)
#undef HWY_RVV_STORE4
1526
1527// ================================================== CONVERT
1528
1529// ------------------------------ PromoteTo
1530
1531// SEW is for the input so we can use F16 (no-op if not supported).
1532#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1533 SHIFT, MLEN, NAME, OP) \
1534 template <size_t N> \
1535 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
1536 HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1537 return OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
1538 }
1539
1540HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1541HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1542HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1543HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1544HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1545HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1546HWY_RVV_FOREACH_F16(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1547HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1548#undef HWY_RVV_PROMOTE
1549
1550// The above X-macro cannot handle 4x promotion nor type switching.
1551// TODO(janwas): use BASE2 arg to allow the latter.
1552#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
1553 SHIFT, ADD) \
1554 template <size_t N> \
1555 HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
1556 PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
1557 HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
1558 return OP##CHAR##BITS##LMUL(v, Lanes(d)); \
1559 }
1560
1561#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1562 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
1563 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
1564 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
1565 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
1566 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
1567
1568#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1569 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
1570 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
1571 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
1572 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
1573 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
1574
1575HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
1576HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
1577
1578// i32 to f64
1579HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
1580
1581#undef HWY_RVV_PROMOTE_X4
1582#undef HWY_RVV_PROMOTE_X2
1583#undef HWY_RVV_PROMOTE
1584
1585// Unsigned to signed: cast for unsigned promote.
1586template <size_t N, int kPow2>
1587HWY_API auto PromoteTo(Simd<int16_t, N, kPow2> d,
1588 VFromD<Rebind<uint8_t, decltype(d)>> v)
1589 -> VFromD<decltype(d)> {
1590 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1591}
1592
1593template <size_t N, int kPow2>
1594HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1595 VFromD<Rebind<uint8_t, decltype(d)>> v)
1596 -> VFromD<decltype(d)> {
1597 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1598}
1599
1600template <size_t N, int kPow2>
1601HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1602 VFromD<Rebind<uint16_t, decltype(d)>> v)
1603 -> VFromD<decltype(d)> {
1604 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1605}
1606
1607template <size_t N, int kPow2>
1608HWY_API auto PromoteTo(Simd<float, N, kPow2> d,
1609 VFromD<Rebind<bfloat16_t, decltype(d)>> v)
1610 -> VFromD<decltype(d)> {
1611 const RebindToSigned<decltype(d)> di32;
1612 const Rebind<uint16_t, decltype(d)> du16;
1613 return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
1614}
1615
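// Scalar model of the bf16 path above (a sketch, not used by the vector
// code): bfloat16 is the upper 16 bits of a binary32, so zero-extending to
// u32 and shifting left by 16 reconstructs the float exactly.
static HWY_INLINE float ExamplePromotedBF16Lane(uint16_t bits16) {
  const uint32_t bits32 = static_cast<uint32_t>(bits16) << 16;
  float f;
  CopyBytes<4>(&bits32, &f);  // same bit reinterpretation as BitCast
  return f;
}
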
1616// ------------------------------ DemoteTo U
1617
1618// SEW is for the source so we can use _DEMOTE.
1619#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1620 MLEN, NAME, OP) \
1621 template <size_t N> \
1622 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1623 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1624 return OP##CHAR##SEWH##LMULH(v, 0, Lanes(d)); \
1625 } \
1626 template <size_t N> \
1627 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME##Shr16( \
1628 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1629 return OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \
1630 }
1631
1632// Unsigned -> unsigned (also used for bf16)
1633namespace detail {
1634HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1635HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1636} // namespace detail
1637
1638// SEW is for the source so we can use _DEMOTE.
1639#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1640 SHIFT, MLEN, NAME, OP) \
1641 template <size_t N> \
1642 HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
1643 HWY_RVV_D(uint, SEWH, N, SHIFT - 1) d, HWY_RVV_V(int, SEW, LMUL) v) { \
1644 /* First clamp negative numbers to zero to match x86 packus. */ \
1645 return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0))); \
1646 }
1647HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1648HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1649#undef HWY_RVV_DEMOTE_I_TO_U
1650
1651template <size_t N>
1652HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
1653 return vnclipu_wx_u8mf8(DemoteTo(Simd<uint16_t, N, -2>(), v), 0, Lanes(d));
1654}
1655template <size_t N>
1656HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
1657 return vnclipu_wx_u8mf4(DemoteTo(Simd<uint16_t, N, -1>(), v), 0, Lanes(d));
1658}
1659template <size_t N>
1660HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
1661 return vnclipu_wx_u8mf2(DemoteTo(Simd<uint16_t, N, 0>(), v), 0, Lanes(d));
1662}
1663template <size_t N>
1664HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
1665 return vnclipu_wx_u8m1(DemoteTo(Simd<uint16_t, N, 1>(), v), 0, Lanes(d));
1666}
1667template <size_t N>
1668HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
1669 return vnclipu_wx_u8m2(DemoteTo(Simd<uint16_t, N, 2>(), v), 0, Lanes(d));
1670}
1671
1672HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
1673 const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
1674 return vnclipu_wx_u8mf8(vnclipu_wx_u16mf4(v, 0, avl), 0, avl);
1675}
1676HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
1677 const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
1678 return vnclipu_wx_u8mf4(vnclipu_wx_u16mf2(v, 0, avl), 0, avl);
1679}
1680HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
1681 const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
1682 return vnclipu_wx_u8mf2(vnclipu_wx_u16m1(v, 0, avl), 0, avl);
1683}
1684HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
1685 const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
1686 return vnclipu_wx_u8m1(vnclipu_wx_u16m2(v, 0, avl), 0, avl);
1687}
1688HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
1689 const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
1690 return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
1691}
1692
1693// ------------------------------ DemoteTo I
1694
1695HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1696HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1697
1698template <size_t N>
1699HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) {
1700 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v));
1701}
1702template <size_t N>
1703HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) {
1704 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v));
1705}
1706template <size_t N>
1707HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) {
1708 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v));
1709}
1710template <size_t N>
1711HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) {
1712 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v));
1713}
1714template <size_t N>
1715HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
1716 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
1717}
1718
1719#undef HWY_RVV_DEMOTE
1720
1721// ------------------------------ DemoteTo F
1722
1723// SEW is for the source so we can use _DEMOTE.
1724#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1725 SHIFT, MLEN, NAME, OP) \
1726 template <size_t N> \
1727 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1728 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1729 return OP##SEWH##LMULH(v, Lanes(d)); \
1730 }
1731
1732#if HWY_HAVE_FLOAT16
1733HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1734 _DEMOTE_VIRT)
1735#endif
1736HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1737 _DEMOTE_VIRT)
1738#undef HWY_RVV_DEMOTE_F
1739
1740// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
1741template <size_t N>
1742HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) {
1743 return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
1744}
1745template <size_t N>
1746HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) {
1747 return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
1748}
1749template <size_t N>
1750HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) {
1751 return vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
1752}
1753template <size_t N>
1754HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) {
1755 return vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
1756}
1757template <size_t N>
1758HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) {
1759 return vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
1760}
1761
1762template <size_t N, int kPow2>
1763HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
1764 Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
1765 const RebindToUnsigned<decltype(d)> du16;
1766 const Rebind<uint32_t, decltype(d)> du32;
1767 return detail::DemoteToShr16(du16, BitCast(du32, v));
1768}
1769
1770// ------------------------------ ConvertTo F
1771
1772#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1773 SHIFT, MLEN, NAME, OP) \
1774 template <size_t N> \
1775 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
1776 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
1777 return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
1778 } \
1779 /* Truncates (rounds toward zero). */ \
1780 template <size_t N> \
1781 HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
1782 HWY_RVV_V(BASE, SEW, LMUL) v) { \
1783 return vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \
1784 } \
1785// API only requires f32 but we provide f64 for internal use.
1786HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
1787#undef HWY_RVV_CONVERT
1788
1789// Uses default rounding mode. Must be separate because there is no D arg.
1790#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1791 SHIFT, MLEN, NAME, OP) \
1792 HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1793 return vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
1794 }
1795HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
1796#undef HWY_RVV_NEAREST
1797
1798// ================================================== COMBINE
1799
1800namespace detail {
1801
1802// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
1803// offsets are implicitly relative to the start of their 128-bit block.
1804template <typename T, size_t N, int kPow2>
1805HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) {
1806 size_t lpb = 16 / sizeof(T);
1807 if (IsFull(d)) return lpb;
1808 // Also honor the user-specified (constexpr) N limit.
1809 lpb = HWY_MIN(lpb, N);
1810 // No fraction, we're done.
1811 if (kPow2 >= 0) return lpb;
1812 // Fractional LMUL: Lanes(d) may be smaller than lpb, so honor that.
1813 return HWY_MIN(lpb, Lanes(d));
1814}
1815
1816template <class D, class V>
1817HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
1818 using T = MakeUnsigned<TFromD<D>>;
1819 return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
1820}
1821
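// Example: for T=uint32_t and a full vector, LanesPerBlock returns
// 16 / 4 = 4 lanes per 128-bit block, so OffsetsOf128BitBlocks masks
// Iota0 = {0,1,2,3,4,5,...} with ~3u to yield {0,0,0,0,4,4,4,4,...}.
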
1822template <size_t kLanes, class D>
1823HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
1824 const RebindToUnsigned<D> du;
1825 const RebindToSigned<D> di;
1826 const auto idx_mod = AndS(Iota0(du), LanesPerBlock(du) - 1);
1827 return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
1828}
1829
1830// vector = f(vector, vector, size_t)
1831#define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1832 MLEN, NAME, OP) \
1833 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1834 NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
1835 size_t lanes) { \
1836 return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \
1837 HWY_RVV_AVL(SEW, SHIFT)); \
1838 }
1839
1840HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideUp, slideup, _ALL)
1841HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideDown, slidedown, _ALL)
1842
1843#undef HWY_RVV_SLIDE
1844
1845} // namespace detail
1846
1847// ------------------------------ ConcatUpperLower
1848template <class D, class V>
1849HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
1850 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
1851}
1852
1853// ------------------------------ ConcatLowerLower
1854template <class D, class V>
1855HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
1856 return detail::SlideUp(lo, hi, Lanes(d) / 2);
1857}
1858
1859// ------------------------------ ConcatUpperUpper
1860template <class D, class V>
1861HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
1862 // Move upper half into lower
1863 const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
1864 return ConcatUpperLower(d, hi, lo_down);
1865}
1866
1867// ------------------------------ ConcatLowerUpper
1868template <class D, class V>
1869HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
1870 // Move half of both inputs to the other half
1871 const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2);
1872 const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
1873 return ConcatUpperLower(d, hi_up, lo_down);
1874}
1875
1876// ------------------------------ Combine
1877template <class D2, class V>
1878HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
1879 return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
1880 Lanes(d2) / 2);
1881}
1882
1883// ------------------------------ ZeroExtendVector
1884
1885template <class D2, class V>
1886HWY_API VFromD<D2> ZeroExtendVector(const D2 d2, const V lo) {
1887 return Combine(d2, Xor(lo, lo), lo);
1888}
1889
1890// ------------------------------ Lower/UpperHalf
1891
1892namespace detail {
1893
1894// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
1895// that SEW = sizeof(T)*8 and LMUL = 1 << Pow2().
1896template <class D>
1897constexpr bool IsSupportedLMUL(D d) {
1898 return (size_t{1} << (Pow2(d) + 3)) >= sizeof(TFromD<D>);
1899}
1900
1901} // namespace detail
1902
1903// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL.
1904template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
1905HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
1906 return detail::Trunc(v);
1907}
1908
1909// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
1910// the hardware may set "vill" if we attempt such an LMUL. However, the V
1911// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
1912// still makes sense to have half of an SEW=64 vector. We instead just return
1913// the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
1914template <class DH, class V,
1915 hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
1916HWY_API V LowerHalf(const DH /* tag */, const V v) {
1917 return v;
1918}
1919
1920// Same, but without D arg
1921template <class V>
1922HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
1923 return LowerHalf(Half<DFromV<V>>(), v);
1924}
1925
1926template <class DH>
1927HWY_API VFromD<DH> UpperHalf(const DH d2, const VFromD<Twice<DH>> v) {
1928 return LowerHalf(d2, detail::SlideDown(v, v, Lanes(d2)));
1929}
1930
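// Usage sketch (hypothetical helper): splitting a vector uses the half-width
// tag; for u32 at LMUL=2, both halves are vuint32m1_t.
template <class D2>
HWY_API void ExampleSplit(D2 /* tag */, VFromD<D2> v, VFromD<Half<D2>>& lo,
                          VFromD<Half<D2>>& hi) {
  const Half<D2> dh;
  lo = LowerHalf(dh, v);
  hi = UpperHalf(dh, v);
}
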
1931// ================================================== SWIZZLE
1932
1933namespace detail {
1934// Special instruction for 1 lane is presumably faster?
1935#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1936 MLEN, NAME, OP) \
1937 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1938 return v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
1939 }
1940
1941HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
1942HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
1943HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
1944HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
1945#undef HWY_RVV_SLIDE1
1946} // namespace detail
1947
1948// ------------------------------ GetLane
1949
1950#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1951 SHIFT, MLEN, NAME, OP) \
1952 HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1953 return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \
1954 }
1955
1956HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL)
1957HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL)
1958#undef HWY_RVV_GET_LANE
1959
1960// ------------------------------ ExtractLane
1961template <class V>
1962HWY_API TFromV<V> ExtractLane(const V v, size_t i) {
1963 return GetLane(detail::SlideDown(v, v, i));
1964}
1965
1966// ------------------------------ InsertLane
1967
1968template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 1)>
1969HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
1970 const DFromV<V> d;
1971 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
1972 using TU = TFromD<decltype(du)>;
1973 const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
1974 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
1975}
1976
1977namespace detail {
1978HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof)
1979} // namespace detail
1980
1981// For 8-bit lanes, Iota0 might overflow.
1982template <class V, HWY_IF_LANE_SIZE_V(V, 1)>
1983HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
1984 const DFromV<V> d;
1985 const auto zero = Zero(d);
1986 const auto one = Set(d, 1);
1987 const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
1988 const auto is_i = detail::SetOnlyFirst(ge_i);
1989 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
1990}
1991
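// Worked example of the 8-bit path above: for i=2, SlideUp(zero, one, 2)
// yields {0,0,1,1,...}; Eq then marks lanes >= 2 and SetOnlyFirst keeps only
// lane 2. No lane indices are materialized, so u8 lanes cannot overflow even
// when Lanes() exceeds 255.
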
1992// ------------------------------ OddEven
1993template <class V>
1994HWY_API V OddEven(const V a, const V b) {
1995 const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
1996 const auto is_even = detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
1997 return IfThenElse(is_even, b, a);
1998}
1999
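// Lane layout: OddEven(a, b) = {b0, a1, b2, a3, ...}, i.e. even-indexed
// lanes are taken from b and odd-indexed lanes from a.
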
2000// ------------------------------ DupEven (OddEven)
2001template <class V>
2002HWY_API V DupEven(const V v) {
2003 const V up = detail::Slide1Up(v);
2004 return OddEven(up, v);
2005}
2006
2007// ------------------------------ DupOdd (OddEven)
2008template <class V>
2009HWY_API V DupOdd(const V v) {
2010 const V down = detail::Slide1Down(v);
2011 return OddEven(v, down);
2012}
2013
2014// ------------------------------ OddEvenBlocks
2015template <class V>
2016HWY_API V OddEvenBlocks(const V a, const V b) {
2017 const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
2018 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
2019 const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
2020 const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
2021 return IfThenElse(is_even, b, a);
2022}
2023
2024// ------------------------------ SwapAdjacentBlocks
2025
2026template <class V>
2027HWY_API V SwapAdjacentBlocks(const V v) {
2028 const DFromV<V> d;
2029 const size_t lpb = detail::LanesPerBlock(d);
2030 const V down = detail::SlideDown(v, v, lpb);
2031 const V up = detail::SlideUp(v, v, lpb);
2032 return OddEvenBlocks(up, down);
2033}
2034
2035// ------------------------------ TableLookupLanes
2036
2037template <class D, class VI>
2038HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
2039 static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
2040 const RebindToUnsigned<decltype(d)> du; // instead of <D>: avoids unused d.
2041 const auto indices = BitCast(du, vec);
2042#if HWY_IS_DEBUG_BUILD
2043 HWY_DASSERT(AllTrue(du, detail::LtS(indices, Lanes(d))));
2044#endif
2045 return indices;
2046}
2047
2048template <class D, typename TI>
2049HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
2050 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
2051 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
2052}
2053
2054// <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX
2055// to 2048! We could instead use vrgatherei16.
2056#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2057 MLEN, NAME, OP) \
2058 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2059 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
2060 return v##OP##_vv_##CHAR##SEW##LMUL(v, idx, HWY_RVV_AVL(SEW, SHIFT)); \
2061 }
2062
2063HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
2064#undef HWY_RVV_TABLE
2065
2066// ------------------------------ ConcatOdd (TableLookupLanes)
2067template <class D, class V>
2068HWY_API V ConcatOdd(D d, const V hi, const V lo) {
2069 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
2070 const auto iota = detail::Iota0(du);
2071 const auto idx = detail::AddS(Add(iota, iota), 1);
2072 const auto lo_odd = TableLookupLanes(lo, idx);
2073 const auto hi_odd = TableLookupLanes(hi, idx);
2074 return detail::SlideUp(lo_odd, hi_odd, Lanes(d) / 2);
2075}
2076
2077// ------------------------------ ConcatEven (TableLookupLanes)
2078template <class D, class V>
2079HWY_API V ConcatEven(D d, const V hi, const V lo) {
2080 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
2081 const auto iota = detail::Iota0(du);
2082 const auto idx = Add(iota, iota);
2083 const auto lo_even = TableLookupLanes(lo, idx);
2084 const auto hi_even = TableLookupLanes(hi, idx);
2085 return detail::SlideUp(lo_even, hi_even, Lanes(d) / 2);
2086}
2087
2088// ------------------------------ Reverse (TableLookupLanes)
2089template <class D>
2090HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
2091 const RebindToUnsigned<D> du;
2092 using TU = TFromD<decltype(du)>;
2093 const size_t N = Lanes(du);
2094 const auto idx =
2095 detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
2096 return TableLookupLanes(v, idx);
2097}
2098
2099// ------------------------------ Reverse2 (RotateRight, OddEven)
2100
2101// Shifting and adding requires fewer instructions than blending, but casting to
2102// u32 only works for LMUL in [1/2, 8].
2103template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -1, 3)>
2104HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2105 const Repartition<uint32_t, D> du32;
2106 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2107}
2108// For LMUL < 1/2, we can extend and then truncate.
2109template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -3, -2)>
2110HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2111 const Twice<decltype(d)> d2;
2112 const Twice<decltype(d2)> d4;
2113 const Repartition<uint32_t, decltype(d4)> du32;
2114 const auto vx = detail::Ext(d4, detail::Ext(d2, v));
2115 const auto rx = BitCast(d4, RotateRight<16>(BitCast(du32, vx)));
2116 return detail::Trunc(detail::Trunc(rx));
2117}
2118
2119// Shifting and adding requires fewer instructions than blending, but casting to
2120// u64 does not work for LMUL < 1.
2121template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, 0, 3)>
2122HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2123 const Repartition<uint64_t, decltype(d)> du64;
2124 return BitCast(d, RotateRight<32>(BitCast(du64, v)));
2125}
2126
2127// For fractions, we can extend and then truncate.
2128template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, -2, -1)>
2129HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2130 const Twice<decltype(d)> d2;
2131 const Twice<decltype(d2)> d4;
2132 const Repartition<uint64_t, decltype(d4)> du64;
2133 const auto vx = detail::Ext(d4, detail::Ext(d2, v));
2134 const auto rx = BitCast(d4, RotateRight<32>(BitCast(du64, vx)));
2135 return detail::Trunc(detail::Trunc(rx));
2136}
2137
2138template <class D, class V = VFromD<D>, HWY_IF_LANE_SIZE_D(D, 8)>
2139HWY_API V Reverse2(D /* tag */, const V v) {
2140 const V up = detail::Slide1Up(v);
2141 const V down = detail::Slide1Down(v);
2142 return OddEven(up, down);
2143}
2144
2145// ------------------------------ Reverse4 (TableLookupLanes)
2146
2147template <class D>
2148HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2149 const RebindToUnsigned<D> du;
2150 const auto idx = detail::XorS(detail::Iota0(du), 3);
2151 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2152}
2153
2154// ------------------------------ Reverse8 (TableLookupLanes)
2155
2156template <class D>
2157HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2158 const RebindToUnsigned<D> du;
2159 const auto idx = detail::XorS(detail::Iota0(du), 7);
2160 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2161}
2162
2163// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
2164template <class D, class V = VFromD<D>>
2165HWY_API V ReverseBlocks(D d, V v) {
2166 const Repartition<uint64_t, D> du64;
2167 const size_t N = Lanes(du64);
2168 const auto rev =
2169 detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
2170 // Swap lo/hi u64 within each block
2171 const auto idx = detail::XorS(rev, 1);
2172 return BitCast(d, TableLookupLanes(BitCast(du64, v), idx));
2173}
2174
2175// ------------------------------ Compress
2176
2177template <typename T>
2178struct CompressIsPartition {
2179 enum { value = 0 };
2180};
2181
2182#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2183 SHIFT, MLEN, NAME, OP) \
2184 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2185 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
2186 return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v, HWY_RVV_AVL(SEW, SHIFT)); \
2187 }
2188
2189HWY_RVV_FOREACH_UI163264(HWY_RVV_COMPRESS, Compress, compress, _ALL)
2190HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress, _ALL)
2191#undef HWY_RVV_COMPRESS
2192
2193// ------------------------------ CompressNot
2194template <class V, class M>
2195HWY_API V CompressNot(V v, const M mask) {
2196 return Compress(v, Not(mask));
2197}
2198
2199// ------------------------------ CompressBlocksNot
2200template <class V, class M>
2201HWY_API V CompressBlocksNot(V v, const M mask) {
2202 return CompressNot(v, mask);
2203}
2204
2205// ------------------------------ CompressStore
2206template <class V, class M, class D>
2207HWY_API size_t CompressStore(const V v, const M mask, const D d,
2208 TFromD<D>* HWY_RESTRICT unaligned) {
2209 StoreU(Compress(v, mask), d, unaligned);
2210 return CountTrue(d, mask);
2211}
2212
2213// ------------------------------ CompressBlendedStore
2214template <class V, class M, class D>
2215HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
2216 TFromD<D>* HWY_RESTRICT unaligned) {
2217 const size_t count = CountTrue(d, mask);
2218 detail::StoreN(count, Compress(v, mask), d, unaligned);
2219 return count;
2220}
2221
2222// ================================================== BLOCKWISE
2223
2224// ------------------------------ CombineShiftRightBytes
2225template <size_t kBytes, class D, class V = VFromD<D>>
2226HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
2227 const Repartition<uint8_t, decltype(d)> d8;
2228 const auto hi8 = BitCast(d8, hi);
2229 const auto lo8 = BitCast(d8, lo);
2230 const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
2231 const auto lo_down = detail::SlideDown(lo8, lo8, kBytes);
2232 const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
2233 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
2234}
2235
2236// ------------------------------ CombineShiftRightLanes
2237template <size_t kLanes, class D, class V = VFromD<D>>
2238HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
2239 constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
2240 const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
2241 const auto lo_down = detail::SlideDown(lo, lo, kLanes);
2242 const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
2243 return IfThenElse(is_lo, lo_down, hi_up);
2244}
2245
2246// ------------------------------ Shuffle2301 (ShiftLeft)
2247template <class V>
2248HWY_API V Shuffle2301(const V v) {
2249 const DFromV<V> d;
2250 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2251 const Repartition<uint64_t, decltype(d)> du64;
2252 const auto v64 = BitCast(du64, v);
2253 return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
2254}
2255
2256// ------------------------------ Shuffle2103
2257template <class V>
2258HWY_API V Shuffle2103(const V v) {
2259 const DFromV<V> d;
2260 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2261 return CombineShiftRightLanes<3>(d, v, v);
2262}
2263
2264// ------------------------------ Shuffle0321
2265template <class V>
2266HWY_API V Shuffle0321(const V v) {
2267 const DFromV<V> d;
2268 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2269 return CombineShiftRightLanes<1>(d, v, v);
2270}
2271
2272// ------------------------------ Shuffle1032
2273template <class V>
2274HWY_API V Shuffle1032(const V v) {
2275 const DFromV<V> d;
2276 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2277 return CombineShiftRightLanes<2>(d, v, v);
2278}
2279
2280// ------------------------------ Shuffle01
2281template <class V>
2282HWY_API V Shuffle01(const V v) {
2283 const DFromV<V> d;
2284 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
2285 return CombineShiftRightLanes<1>(d, v, v);
2286}
2287
2288// ------------------------------ Shuffle0123
2289template <class V>
2290HWY_API V Shuffle0123(const V v) {
2291 return Shuffle2301(Shuffle1032(v));
2292}
2293
2294// ------------------------------ TableLookupBytes
2295
2296// Extends or truncates a vector to match the given d.
2297namespace detail {
2298
2299template <typename T, size_t N, int kPow2>
2300HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 3>> v)
2301 -> VFromD<decltype(d)> {
2302 const Simd<T, N, kPow2 - 1> dh;
2303 const Simd<T, N, kPow2 - 2> dhh;
2304 return Ext(d, Ext(dh, Ext(dhh, v)));
2305}
2306template <typename T, size_t N, int kPow2>
2307HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 2>> v)
2308 -> VFromD<decltype(d)> {
2309 const Simd<T, N, kPow2 - 1> dh;
2310 return Ext(d, Ext(dh, v));
2311}
2312template <typename T, size_t N, int kPow2>
2313HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 1>> v)
2314 -> VFromD<decltype(d)> {
2315 return Ext(d, v);
2316}
2317
2318template <typename T, size_t N, int kPow2>
2319HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2>> v)
2320 -> VFromD<decltype(d)> {
2321 return v;
2322}
2323
2324template <typename T, size_t N, int kPow2>
2325HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 1>> v)
2326 -> VFromD<decltype(d)> {
2327 return Trunc(v);
2328}
2329template <typename T, size_t N, int kPow2>
2330HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 2>> v)
2331 -> VFromD<decltype(d)> {
2332 return Trunc(Trunc(v));
2333}
2334template <typename T, size_t N, int kPow2>
2335HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 3>> v)
2336 -> VFromD<decltype(d)> {
2337 return Trunc(Trunc(Trunc(v)));
2338}
2339
2340} // namespace detail
2341
2342template <class VT, class VI>
2343HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
2344 const DFromV<VT> dt; // T=table, I=index.
2345 const DFromV<VI> di;
2346 const Repartition<uint8_t, decltype(dt)> dt8;
2347 const Repartition<uint8_t, decltype(di)> di8;
2348 // Required for producing half-vectors with table lookups from a full vector.
2349 // If we instead run at the LMUL of the index vector, lookups into the table
2350 // would be truncated. Thus we run at the larger of the two LMULs and truncate
2351 // the result vector to the original index LMUL.
2352 constexpr int kPow2T = Pow2(dt8);
2353 constexpr int kPow2I = Pow2(di8);
2354 const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8; // m=max
2355 const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt));
2356 const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi));
2357 auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
2358 // If the table is shorter, wrap around offsets so they do not reference
2359 // undefined lanes in the newly extended vmt.
2360 if (kPow2T < kPow2I) {
2361 offsets = detail::AndS(offsets, Lanes(dt8) - 1);
2362 }
2363 const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
2364 return BitCast(di, detail::ChangeLMUL(di8, out));
2365}
2366
2367template <class VT, class VI>
2368HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
2369 const DFromV<VI> di;
2370 const Repartition<int8_t, decltype(di)> di8;
2371 const auto idx8 = BitCast(di8, idx);
2372 const auto lookup = TableLookupBytes(vt, idx8);
2373 return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup));
2374}
2375
2376// ------------------------------ Broadcast
2377template <int kLane, class V>
2378HWY_API V Broadcast(const V v) {
2379 const DFromV<V> d;
2380 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
2381 auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(d));
2382 if (kLane != 0) {
2383 idx = detail::AddS(idx, kLane);
2384 }
2385 return TableLookupLanes(v, idx);
2386}
2387
2388// ------------------------------ ShiftLeftLanes
2389
2390template <size_t kLanes, class D, class V = VFromD<D>>
2391HWY_API V ShiftLeftLanes(const D d, const V v) {
2392 const RebindToSigned<decltype(d)> di;
2393 using TI = TFromD<decltype(di)>;
2394 const auto shifted = detail::SlideUp(v, v, kLanes);
2395 // Match x86 semantics by zeroing lower lanes in 128-bit blocks
2396 const auto idx_mod =
2397 detail::AndS(detail::Iota0(di), detail::LanesPerBlock(di) - 1);
2398 const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
2399 return IfThenZeroElse(clear, shifted);
2400}
2401
2402template <size_t kLanes, class V>
2403HWY_API V ShiftLeftLanes(const V v) {
2404 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
2405}
2406
2407// ------------------------------ ShiftLeftBytes
2408
2409template <int kBytes, class D>
2410HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) {
2411 const Repartition<uint8_t, decltype(d)> d8;
2412 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
2413}
2414
2415template <int kBytes, class V>
2416HWY_API V ShiftLeftBytes(const V v) {
2417 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
2418}
2419
2420// ------------------------------ ShiftRightLanes
2421template <size_t kLanes, typename T, size_t N, int kPow2,
2422 class V = VFromD<Simd<T, N, kPow2>>>
2423HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
2424 const RebindToSigned<decltype(d)> di;
2425 using TI = TFromD<decltype(di)>;
2426 // For partial vectors, clear upper lanes so we shift in zeros.
2427 if (N <= 16 / sizeof(T)) {
2428 v = IfThenElseZero(FirstN(d, N), v);
2429 }
2430
2431 const auto shifted = detail::SlideDown(v, v, kLanes);
2432 // Match x86 semantics by zeroing upper lanes in 128-bit blocks
2433 const size_t lpb = detail::LanesPerBlock(di);
2434 const auto idx_mod = detail::AndS(detail::Iota0(di), lpb - 1);
2435 const auto keep =
2436 detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
2437 return IfThenElseZero(keep, shifted);
2438}
2439
2440// ------------------------------ ShiftRightBytes
2441template <int kBytes, class D, class V = VFromD<D>>
2442HWY_API V ShiftRightBytes(const D d, const V v) {
2443 const Repartition<uint8_t, decltype(d)> d8;
2444 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
2445}
2446
2447// ------------------------------ InterleaveLower
2448
2449template <class D, class V>
2450HWY_API V InterleaveLower(D d, const V a, const V b) {
2451 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2452 const RebindToUnsigned<decltype(d)> du;
2453 const auto i = detail::Iota0(du);
2454 const auto idx_mod =
2455 ShiftRight<1>(detail::AndS(i, detail::LanesPerBlock(du) - 1));
2456 const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2457 const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2458 return IfThenElse(is_even, TableLookupLanes(a, idx),
2459 TableLookupLanes(b, idx));
2460}
2461
2462template <class V>
2463HWY_API V InterleaveLower(const V a, const V b) {
2464 return InterleaveLower(DFromV<V>(), a, b);
2465}
2466
2467// ------------------------------ InterleaveUpper
2468
2469template <class D, class V>
2470HWY_API V InterleaveUpper(const D d, const V a, const V b) {
2471 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2472 const RebindToUnsigned<decltype(d)> du;
2473 const size_t lpb = detail::LanesPerBlock(du);
2474 const auto i = detail::Iota0(du);
2475 const auto idx_mod = ShiftRight<1>(detail::AndS(i, lpb - 1));
2476 const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2477 const auto idx = detail::AddS(idx_lower, lpb / 2);
2478 const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2479 return IfThenElse(is_even, TableLookupLanes(a, idx),
2480 TableLookupLanes(b, idx));
2481}
2482
2483// ------------------------------ ZipLower
2484
2485template <class V, class DW = RepartitionToWide<DFromV<V>>>
2486HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2487 const RepartitionToNarrow<DW> dn;
2488 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2489 return BitCast(dw, InterleaveLower(dn, a, b));
2490}
2491
2492template <class V, class DW = RepartitionToWide<DFromV<V>>>
2493HWY_API VFromD<DW> ZipLower(V a, V b) {
2494 return BitCast(DW(), InterleaveLower(a, b));
2495}
2496
2497// ------------------------------ ZipUpper
2498template <class DW, class V>
2499HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2500 const RepartitionToNarrow<DW> dn;
2501 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2502 return BitCast(dw, InterleaveUpper(dn, a, b));
2503}
2504
2505// ================================================== REDUCE
2506
2507// vector = f(vector, zero_m1)
2508#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2509 MLEN, NAME, OP) \
2510 template <class D> \
2511 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2512 NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \
2513 return Set(d, GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
2514 v0, v, v0, Lanes(d)))); \
2515 }
2516
2517// ------------------------------ SumOfLanes
2518
2519namespace detail {
2520HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL)
2521HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL)
2522} // namespace detail
2523
2524template <class D>
2525HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) {
2526 const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1
2527 return detail::RedSum(d, v, v0);
2528}
2529
2530// ------------------------------ MinOfLanes
2531namespace detail {
2532HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL)
2533HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL)
2534HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL)
2535} // namespace detail
2536
2537template <class D>
2538HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) {
2539 using T = TFromD<D>;
2540 const ScalableTag<T> d1; // always m1
2541 const auto neutral = Set(d1, HighestValue<T>());
2542 return detail::RedMin(d, v, neutral);
2543}
2544
2545// ------------------------------ MaxOfLanes
2546namespace detail {
2547HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL)
2548HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL)
2549HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL)
2550} // namespace detail
2551
2552template <class D>
2553HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
2554 using T = TFromD<D>;
2555 const ScalableTag<T> d1; // always m1
2556 const auto neutral = Set(d1, LowestValue<T>());
2557 return detail::RedMax(d, v, neutral);
2558}
2559
2560#undef HWY_RVV_REDUCE
2561
2562// ================================================== Ops with dependencies
2563
2564// ------------------------------ PopulationCount (ShiftRight)
2565
2566// Handles LMUL >= 2 or capped vectors, which generic_ops-inl cannot.
2567template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
2568 hwy::EnableIf<Pow2(D()) < 1 || MaxLanes(D()) < 16>* = nullptr>
2569HWY_API V PopulationCount(V v) {
2570 // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
2571 v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55));
2572 v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33));
2573 return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F);
2574}
2575
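// Worked example for one lane v = 0xFF: step 1 leaves per-pair counts
// 0xAA (binary 10'10'10'10); step 2 sums pairs into nibbles, giving 0x44;
// step 3 adds the nibbles and masks: (0x44 + 0x04) & 0x0F = 8 bits set.
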
2576// ------------------------------ LoadDup128
2577
2578template <class D>
2579HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
2580 const auto loaded = Load(d, p);
2581 // Broadcast the first block
2582 const auto idx = detail::AndS(detail::Iota0(d), detail::LanesPerBlock(d) - 1);
2583 return TableLookupLanes(loaded, idx);
2584}
2585
2586// ------------------------------ LoadMaskBits
2587
2588// Support all combinations of T and SHIFT(LMUL) without explicit overloads for
2589// each. First overload for MLEN=1..64.
2590namespace detail {
2591
2592// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN
2593// increases with lane size and decreases for increasing LMUL. Cap at 64, the
2594// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL
2595// e.g. vuint16mf8_t: (8*2 << 3) == 128.
2596template <class D>
2597using MaskTag = hwy::SizeTag<HWY_MIN(
2598 64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -Pow2(D())))>;
2599
2600#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2601 HWY_INLINE HWY_RVV_M(MLEN) \
2602 NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \
2603 return OP##_v_b##MLEN(bits, N); \
2604 }
2605HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, vlm)
2606#undef HWY_RVV_LOAD_MASK_BITS
2607} // namespace detail
2608
2609template <class D, class MT = detail::MaskTag<D>>
2610HWY_API auto LoadMaskBits(D d, const uint8_t* bits)
2611 -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) {
2612 return detail::LoadMaskBits(MT(), bits, Lanes(d));
2613}
2614
2615// ------------------------------ StoreMaskBits
2616#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2617 template <class D> \
2618 HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \
2619 const size_t N = Lanes(d); \
2620 OP##_v_b##MLEN(bits, m, N); \
2621 /* Non-full byte, need to clear the undefined upper bits. */ \
2622 /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \
2623 constexpr bool kLessThan8 = \
2624 detail::ScaleByPower(16 / sizeof(TFromD<D>), Pow2(d)) < 8; \
2625 if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \
2626 const int mask = (1 << N) - 1; \
2627 bits[0] = static_cast<uint8_t>(bits[0] & mask); \
2628 } \
2629 return (N + 7) / 8; \
2630 }
2631HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, vsm)
2632#undef HWY_RVV_STORE_MASK_BITS
2633
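// Example: for N=5 active lanes, the intrinsic writes one byte whose bits
// 5..7 are undefined; (1 << 5) - 1 = 0x1F clears them, and the function
// returns (5 + 7) / 8 = 1 byte written.
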
2634// ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
2635
2636template <class V>
2637HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2638 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
2639}
2640
2641template <class D>
2642HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2643 D d, TFromD<D>* HWY_RESTRICT unaligned) {
2644 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2645}
2646
2647// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
2648
2649// Disallow for 8-bit because Iota is likely to overflow.
2650template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2651HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2652 const RebindToSigned<D> di;
2653 using TI = TFromD<decltype(di)>;
2654 return RebindMask(
2655 d, detail::LtS(BitCast(di, detail::Iota0(d)), static_cast<TI>(n)));
2656}
2657
2658template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
2659HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2660 const auto zero = Zero(d);
2661 const auto one = Set(d, 1);
2662 return Eq(detail::SlideUp(one, zero, n), one);
2663}
2664
2665// ------------------------------ Neg (Sub)
2666
2667template <class V, HWY_IF_SIGNED_V(V)>
2668HWY_API V Neg(const V v) {
2669 return detail::ReverseSubS(v, 0);
2670}
2671
2672// vector = f(vector), but argument is repeated
2673#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2674 SHIFT, MLEN, NAME, OP) \
2675 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2676 return v##OP##_vv_##CHAR##SEW##LMUL(v, v, HWY_RVV_AVL(SEW, SHIFT)); \
2677 }
2678
2679HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
2680
2681// ------------------------------ Abs (Max, Neg)
2682
2683template <class V, HWY_IF_SIGNED_V(V)>
2684HWY_API V Abs(const V v) {
2685 return Max(v, Neg(v));
2686}
2687
2688HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)
2689
2690#undef HWY_RVV_RETV_ARGV2
2691
2692// ------------------------------ AbsDiff (Abs, Sub)
2693template <class V>
2694HWY_API V AbsDiff(const V a, const V b) {
2695 return Abs(Sub(a, b));
2696}
2697
2698// ------------------------------ Round (NearestInt, ConvertTo, CopySign)
2699
2700// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
2701// a dedicated instruction for that. Rounding to integer and converting back to
2702// float is correct except when the input magnitude is large, in which case the
2703// input was already an integer (the exponent exceeds the mantissa width,
2703// so there are no fractional bits).
2704
2705namespace detail {
2706enum RoundingModes { kNear, kTrunc, kDown, kUp };
2707
2708template <class V>
2709HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
2710 return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
2711}
2712
2713} // namespace detail
2714
2715template <class V>
2716HWY_API V Round(const V v) {
2717 const DFromV<V> df;
2718
2719 const auto integer = NearestInt(v); // round using current mode
2720 const auto int_f = ConvertTo(df, integer);
2721
2722 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
2723}
2724
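// Scalar model of Round (a sketch; MantissaEnd<float>() == 2^23):
//   float RoundLane(float v) {
//     if (fabsf(v) >= 8388608.0f) return v;  // already integral
//     return nearbyintf(v);                  // current mode: ties to even
//   }
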
2725// ------------------------------ Trunc (ConvertTo)
2726template <class V>
2727HWY_API V Trunc(const V v) {
2728 const DFromV<V> df;
2729 const RebindToSigned<decltype(df)> di;
2730
2731 const auto integer = ConvertTo(di, v); // round toward 0
2732 const auto int_f = ConvertTo(df, integer);
2733
2734 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
2735}
2736
2737// ------------------------------ Ceil
2738template <class V>
2739HWY_API V Ceil(const V v) {
2740 asm volatile("fsrm %0" ::"r"(detail::kUp));
2741 const auto ret = Round(v);
2742 asm volatile("fsrm %0" ::"r"(detail::kNear));
2743 return ret;
2744}
2745
2746// ------------------------------ Floor
2747template <class V>
2748HWY_API V Floor(const V v) {
2749 asm volatile("fsrm %0" ::"r"(detail::kDown));
2750 const auto ret = Round(v);
2751 asm volatile("fsrm %0" ::"r"(detail::kNear));
2752 return ret;
2753}
2754
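// Note that Ceil and Floor assume the dynamic rounding mode was the default
// round-to-nearest (kNear) on entry, because that is what they restore.
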
2755// ------------------------------ Floating-point classification (Ne)
2756
2757// vfclass does not help because it would require 3 instructions (to AND and
2758// then compare the bits), whereas these are just 1-3 integer instructions.
2759
2760template <class V>
2761HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
2762 return Ne(v, v);
2763}
2764
2765template <class V, class D = DFromV<V>>
2766HWY_API MFromD<D> IsInf(const V v) {
2767 const D d;
2768 const RebindToSigned<decltype(d)> di;
2769 using T = TFromD<D>;
2770 const VFromD<decltype(di)> vi = BitCast(di, v);
2771 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
2772 return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
2773}
2774
2775// Returns whether normal/subnormal/zero.
2776template <class V, class D = DFromV<V>>
2777HWY_API MFromD<D> IsFinite(const V v) {
2778 const D d;
2779 const RebindToUnsigned<decltype(d)> du;
2780 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
2781 using T = TFromD<D>;
2782 const VFromD<decltype(du)> vu = BitCast(du, v);
2783 // 'Shift left' to clear the sign bit, then right so we can compare with the
2784 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
2785 // negative and non-negative floats would be greater).
2786 const VFromD<decltype(di)> exp =
2787 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
2788 return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
2789}
2790
2791// ------------------------------ Iota (ConvertTo)
2792
2793template <class D, HWY_IF_UNSIGNED_D(D)>
2794HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2795 return detail::AddS(detail::Iota0(d), first);
2796}
2797
2798template <class D, HWY_IF_SIGNED_D(D)>
2799HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2800 const RebindToUnsigned<D> du;
2801 return detail::AddS(BitCast(d, detail::Iota0(du)), first);
2802}
2803
2804template <class D, HWY_IF_FLOAT_D(D)>
2805HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2806 const RebindToUnsigned<D> du;
2807 const RebindToSigned<D> di;
2808 return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
2809}
2810
2811// ------------------------------ MulEven/Odd (Mul, OddEven)
2812
2813template <class V, HWY_IF_LANE_SIZE_V(V, 4), class D = DFromV<V>,
2814 class DW = RepartitionToWide<D>>
2815HWY_API VFromD<DW> MulEven(const V a, const V b) {
2816 const auto lo = Mul(a, b);
2817 const auto hi = detail::MulHigh(a, b);
2818 return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
2819}
2820
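// Lane layout of MulEven: lo = {L0,L1,L2,...} (low product halves) and
// hi = {H0,H1,H2,...}; Slide1Up(hi) = {0,H0,H1,...}, so OddEven picks
// {L0,H0,L2,H2,...}, i.e. the full widened products of even-indexed lanes.
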
2821// There is no 64x64 vwmul.
2822template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2823HWY_INLINE V MulEven(const V a, const V b) {
2824 const auto lo = detail::Mul(a, b);
2825 const auto hi = detail::MulHigh(a, b);
2826 return OddEven(detail::Slide1Up(hi), lo);
2827}
2828
2829template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2830HWY_INLINE V MulOdd(const V a, const V b) {
2831 const auto lo = detail::Mul(a, b);
2832 const auto hi = detail::MulHigh(a, b);
2833 return OddEven(hi, detail::Slide1Down(lo));
2834}
2835
2836// ------------------------------ ReorderDemote2To (OddEven)
2837
2838template <size_t N, int kPow2>
2839HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
2840 Simd<bfloat16_t, N, kPow2> dbf16,
2841 VFromD<RepartitionToWide<decltype(dbf16)>> a,
2842 VFromD<RepartitionToWide<decltype(dbf16)>> b) {
2843 const RebindToUnsigned<decltype(dbf16)> du16;
2844 const RebindToUnsigned<DFromV<decltype(a)>> du32;
2845 const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
2846 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2847}
2848
2849// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2850
2851template <class DF>
2852using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
2853
2854template <size_t N, int kPow2>
2855HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
2856 VFromD<DU16FromDF<decltype(df32)>> a,
2857 VFromD<DU16FromDF<decltype(df32)>> b,
2858 const VFromD<decltype(df32)> sum0,
2859 VFromD<decltype(df32)>& sum1)
2860 -> VFromD<decltype(df32)> {
2861 const DU16FromDF<decltype(df32)> du16;
2862 const RebindToUnsigned<decltype(df32)> du32;
2863 using VU32 = VFromD<decltype(du32)>;
2864 const VFromD<decltype(du16)> zero = Zero(du16);
2865 const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
2866 const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
2867 const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
2868 const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
2869 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2870 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2871}
2872
2873// ------------------------------ Lt128
2874template <class D>
2875HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
2876 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2877 // Truth table of Eq and Compare for Hi and Lo u64.
2878 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
2879 // =H =L cH cL | out = cH | (=H & cL)
2880 // 0 0 0 0 | 0
2881 // 0 0 0 1 | 0
2882 // 0 0 1 0 | 1
2883 // 0 0 1 1 | 1
2884 // 0 1 0 0 | 0
2885 // 0 1 0 1 | 0
2886 // 0 1 1 0 | 1
2887 // 1 0 0 0 | 0
2888 // 1 0 0 1 | 1
2889 // 1 1 0 0 | 0
2890 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
2891 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
2892 // Shift leftward so L can influence H.
2893 const VFromD<D> ltLx = detail::Slide1Up(ltHL);
2894 const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
2895 // Replicate H to its neighbor.
2896 return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
2897}
2898
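// Scalar model of the truth table above: out = cH | (=H & cL), i.e.
//   bool Lt128(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
//     return (aH < bH) || (aH == bH && aL < bL);
//   }
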
2899// ------------------------------ Lt128Upper
2900template <class D>
2901HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
2902 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2903 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
2904 // Replicate H to its neighbor.
2905 return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
2906}
2907
2908// ------------------------------ Min128, Max128 (Lt128)
2909
2910template <class D>
2911HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
2912 const VFromD<D> aXH = detail::Slide1Down(a);
2913 const VFromD<D> bXH = detail::Slide1Down(b);
2914 const VFromD<D> minHL = Min(a, b);
2915 const MFromD<D> ltXH = Lt(aXH, bXH);
2916 const MFromD<D> eqXH = Eq(aXH, bXH);
2917 // If the upper lane is the decider, take lo from the same reg.
2918 const VFromD<D> lo = IfThenElse(ltXH, a, b);
2919 // The upper lane is just minHL; if they are equal, we also need to use the
2920 // actual min of the lower lanes.
2921 return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
2922}
2923
2924template <class D>
2925HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
2926 const VFromD<D> aXH = detail::Slide1Down(a);
2927 const VFromD<D> bXH = detail::Slide1Down(b);
2928 const VFromD<D> maxHL = Max(a, b);
2929 const MFromD<D> ltXH = Lt(aXH, bXH);
2930 const MFromD<D> eqXH = Eq(aXH, bXH);
2931 // If the upper lane is the decider, take lo from the same reg.
2932 const VFromD<D> lo = IfThenElse(ltXH, b, a);
2933 // The upper lane is just maxHL; if they are equal, we also need to use the
2934 // actual max of the lower lanes.
2935 return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
2936}
2937
2938template <class D>
2939HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
2940 return IfThenElse(Lt128Upper(d, a, b), a, b);
2941}
2942
2943template <class D>
2944HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
2945 return IfThenElse(Lt128Upper(d, b, a), a, b);
2946}
2947
2948// ================================================== END MACROS
2949namespace detail { // for code folding
2950#undef HWY_RVV_AVL
2951#undef HWY_RVV_D
2952#undef HWY_RVV_FOREACH
2953#undef HWY_RVV_FOREACH_08_ALL
2954#undef HWY_RVV_FOREACH_08_ALL_VIRT
2955#undef HWY_RVV_FOREACH_08_DEMOTE
2956#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
2957#undef HWY_RVV_FOREACH_08_EXT
2958#undef HWY_RVV_FOREACH_08_EXT_VIRT
2959#undef HWY_RVV_FOREACH_08_TRUNC
2960#undef HWY_RVV_FOREACH_08_VIRT
2961#undef HWY_RVV_FOREACH_16_ALL
2962#undef HWY_RVV_FOREACH_16_ALL_VIRT
2963#undef HWY_RVV_FOREACH_16_DEMOTE
2964#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
2965#undef HWY_RVV_FOREACH_16_EXT
2966#undef HWY_RVV_FOREACH_16_EXT_VIRT
2967#undef HWY_RVV_FOREACH_16_TRUNC
2968#undef HWY_RVV_FOREACH_16_VIRT
2969#undef HWY_RVV_FOREACH_32_ALL
2970#undef HWY_RVV_FOREACH_32_ALL_VIRT
2971#undef HWY_RVV_FOREACH_32_DEMOTE
2972#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
2973#undef HWY_RVV_FOREACH_32_EXT
2974#undef HWY_RVV_FOREACH_32_EXT_VIRT
2975#undef HWY_RVV_FOREACH_32_TRUNC
2976#undef HWY_RVV_FOREACH_32_VIRT
2977#undef HWY_RVV_FOREACH_64_ALL
2978#undef HWY_RVV_FOREACH_64_ALL_VIRT
2979#undef HWY_RVV_FOREACH_64_DEMOTE
2980#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
2981#undef HWY_RVV_FOREACH_64_EXT
2982#undef HWY_RVV_FOREACH_64_EXT_VIRT
2983#undef HWY_RVV_FOREACH_64_TRUNC
2984#undef HWY_RVV_FOREACH_64_VIRT
2985#undef HWY_RVV_FOREACH_B
2986#undef HWY_RVV_FOREACH_F
2987#undef HWY_RVV_FOREACH_F16
2988#undef HWY_RVV_FOREACH_F32
2989#undef HWY_RVV_FOREACH_F3264
2990#undef HWY_RVV_FOREACH_F64
2991#undef HWY_RVV_FOREACH_I
2992#undef HWY_RVV_FOREACH_I08
2993#undef HWY_RVV_FOREACH_I16
2994#undef HWY_RVV_FOREACH_I163264
2995#undef HWY_RVV_FOREACH_I32
2996#undef HWY_RVV_FOREACH_I64
2997#undef HWY_RVV_FOREACH_U
2998#undef HWY_RVV_FOREACH_U08
2999#undef HWY_RVV_FOREACH_U16
3000#undef HWY_RVV_FOREACH_U163264
3001#undef HWY_RVV_FOREACH_U32
3002#undef HWY_RVV_FOREACH_U64
3003#undef HWY_RVV_FOREACH_UI
3004#undef HWY_RVV_FOREACH_UI08
3005#undef HWY_RVV_FOREACH_UI16
3006#undef HWY_RVV_FOREACH_UI163264
3007#undef HWY_RVV_FOREACH_UI32
3008#undef HWY_RVV_FOREACH_UI3264
3009#undef HWY_RVV_FOREACH_UI64
3010#undef HWY_RVV_M
3011#undef HWY_RVV_RETM_ARGM
3012#undef HWY_RVV_RETV_ARGV
3013#undef HWY_RVV_RETV_ARGVS
3014#undef HWY_RVV_RETV_ARGVV
3015#undef HWY_RVV_T
3016#undef HWY_RVV_V
3017} // namespace detail
3018// NOLINTNEXTLINE(google-readability-namespace-comments)
3019} // namespace HWY_NAMESPACE
3020} // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DASSERT(condition)
Definition: base.h:191
HWY_INLINE VFromD< DU > BitCastToUnsigned(V v)
Definition: rvv-inl.h:691
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition: rvv-inl.h:1817
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:1937
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3345
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE auto ChangeLMUL(Simd< T, N, kPow2 > d, VFromD< Simd< T, N, kPow2 - 3 > > v) -> VFromD< decltype(d)>
Definition: rvv-inl.h:2300
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:111
constexpr bool IsSupportedLMUL(D d)
Definition: rvv-inl.h:1897
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition: ops/shared-inl.h:103
HWY_INLINE MFromD< D > FirstNPerBlock(D)
Definition: rvv-inl.h:1823
HWY_INLINE VFromD< DU > Iota0(const D)
Definition: rvv-inl.h:712
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
decltype(MaskFromVec(Zero(D()))) MFromD
Definition: rvv-inl.h:1155
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
RepartitionToNarrow< RebindToUnsigned< DF > > DU16FromDF
Definition: rvv-inl.h:2852
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
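
An illustrative sketch of tail handling built from FirstN, MaskedLoad and BlendedStore (all listed on this page). CopyRemainder is a hypothetical name; it assumes masked-off lanes of MaskedLoad read as zero, and whether inactive lanes may still be touched is target-dependent.

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Copies the final `remaining` (< Lanes(d)) floats. Masked-off lanes of
// MaskedLoad are zero; BlendedStore writes only the active lanes.
void CopyRemainder(const float* HWY_RESTRICT from, float* HWY_RESTRICT to,
                   size_t remaining) {
  const hn::ScalableTag<float> d;
  const auto mask = hn::FirstN(d, remaining);    // lanes [0, remaining)
  const auto v = hn::MaskedLoad(mask, d, from);
  hn::BlendedStore(v, mask, d, to);
}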
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
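
The canonical MulAdd use is a scaled accumulate (AXPY). An illustrative sketch, with Axpy a hypothetical name and tail handling again omitted:

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// y[i] += a * x[i]; MulAdd(mul, x, add) computes mul * x + add, fused where
// the target supports it. Assumes size is a multiple of Lanes(d).
void Axpy(float a, const float* HWY_RESTRICT x, float* HWY_RESTRICT y,
          size_t size) {
  const hn::ScalableTag<float> d;
  const auto va = hn::Set(d, a);
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    const auto sum = hn::MulAdd(va, hn::LoadU(d, x + i), hn::LoadU(d, y + i));
    hn::StoreU(sum, d, y + i);
  }
}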
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
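
Per the signature above, SumOfLanes broadcasts the total to every lane, so a scalar reduction pairs it with GetLane. An illustrative sketch (SumArray is a hypothetical name; tail handling omitted):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Returns the sum of `size` floats (size a multiple of Lanes(d)): accumulate
// vertically in vector lanes, then reduce horizontally once at the end.
float SumArray(const float* HWY_RESTRICT p, size_t size) {
  const hn::ScalableTag<float> d;
  auto acc = hn::Zero(d);
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    acc = hn::Add(acc, hn::LoadU(d, p + i));
  }
  return hn::GetLane(hn::SumOfLanes(d, acc));
}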
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition: ops/shared-inl.h:276
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
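
GatherIndex takes a vector of signed indices with the same lane width as the gathered type (int32_t for float), which RebindToSigned expresses directly. An illustrative single-vector sketch (GatherOne is a hypothetical name):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// out[i] = base[indices[i]] for one vector of lanes.
void GatherOne(const float* HWY_RESTRICT base,
               const int32_t* HWY_RESTRICT indices,
               float* HWY_RESTRICT out) {
  using DF = hn::ScalableTag<float>;
  const DF d;
  const hn::RebindToSigned<DF> di;  // int32_t, same lane count
  const auto idx = hn::LoadU(di, indices);
  hn::StoreU(hn::GatherIndex(d, base, idx), d, out);
}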
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
HWY_API Vec128< T, N > ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
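
The usual IfThenElse idiom is compare-then-blend. An illustrative sketch (Relu is a hypothetical name; ZeroIfNegative, listed below, is a shortcut for this exact pattern):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Per-lane max(v, 0): negative lanes are replaced by zero.
template <class D, class V>
V Relu(D d, V v) {
  const auto zero = hn::Zero(d);
  return hn::IfThenElse(hn::Lt(v, zero), zero, v);
}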
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
constexpr size_t MLenFromD(Simd< T, N, kPow2 >)
Definition: rvv-inl.h:43
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:211
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:161
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
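
BitCast enables bit manipulation of floats: reinterpret as unsigned lanes, operate, reinterpret back. An illustrative sketch that reproduces Abs for 32-bit float lanes (AbsViaBits is a hypothetical name):

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// |v| by clearing each lane's sign bit; equivalent to Abs(v) for floats.
template <class D, class V>
V AbsViaBits(D d, V v) {
  const hn::RebindToUnsigned<D> du;  // uint32_t lanes, same lane count
  const auto bits = hn::And(hn::BitCast(du, v), hn::Set(du, 0x7FFFFFFFu));
  return hn::BitCast(d, bits);
}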
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
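
An illustrative use of LoadInterleaved3: de-interleave packed RGB floats and compute BT.601 luma. RgbToGray is a hypothetical name, and pixels is assumed a multiple of Lanes(d):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// gray[i] = 0.299 r + 0.587 g + 0.114 b for interleaved r,g,b,r,g,b,... input.
void RgbToGray(const float* HWY_RESTRICT rgb, float* HWY_RESTRICT gray,
               size_t pixels) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < pixels; i += hn::Lanes(d)) {
    auto r = hn::Undefined(d), g = hn::Undefined(d), b = hn::Undefined(d);
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);  // splits channels to lanes
    auto y = hn::Mul(r, hn::Set(d, 0.299f));
    y = hn::MulAdd(g, hn::Set(d, 0.587f), y);
    y = hn::MulAdd(b, hn::Set(d, 0.114f), y);
    hn::StoreU(y, d, gray + i);
  }
}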
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D)
Definition: ops/shared-inl.h:252
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
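
CompressStore supports stream compaction: build a mask per vector and append only the selected lanes. An illustrative sketch (FilterGreater is a hypothetical name); on some targets CompressStore may write a full vector's worth of slots, so out needs slack past the returned count, which CompressBlendedStore (also listed here) avoids:

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Appends all in[i] > threshold to out, packed; returns the count written.
// Assumes size is a multiple of Lanes(d).
size_t FilterGreater(const float* HWY_RESTRICT in, size_t size,
                     float threshold, float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const auto vthresh = hn::Set(d, threshold);
  size_t written = 0;
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, in + i);
    written += hn::CompressStore(v, hn::Gt(v, vthresh), d, out + written);
  }
  return written;
}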
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_API V Trunc(const V v)
Definition: rvv-inl.h:2727
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo)
Definition: rvv-inl.h:2238
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
constexpr T MantissaEnd()
Definition: base.h:631
HWY_API constexpr bool IsSame()
Definition: base.h:322
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
#define HWY_IF_LANE_SIZE_D(D, bytes)
Definition: ops/shared-inl.h:235
#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:344
#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1399
HWY_AFTER_NAMESPACE()
#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition: rvv-inl.h:1561
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:567
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1020
#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:339
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:508
#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1302
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:299
#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1370
#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:353
#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1435
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:610
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:293
#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:370
#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1328
#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition: rvv-inl.h:1568
#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:2508
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:626
#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1935
#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1790
#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1341
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:539
#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:323
#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:912
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:992
#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1619
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:409
#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1552
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:903
#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1483
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:848
#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:327
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1167
#define HWY_RVV_IF_POW2_IN(D, min, max)
Definition: rvv-inl.h:39
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1115
#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:2056
#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1450
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP)
Definition: rvv-inl.h:59
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:447
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:311
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:301
#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1513
#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:349
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:463
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:440
#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1498
#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:394
#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1240
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:287
#define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1831
#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:2182
HWY_BEFORE_NAMESPACE()
#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:379
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:472
#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1724
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:530
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:313
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:581
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1096
#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:364
#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1266
#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1639
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:291
#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:701
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:642
#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:335
#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1135
#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1466
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1222
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:552
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:596
#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1315
#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1950
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1029
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:358
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:375
#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1772
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:455
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:289
#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1252
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:297
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
uint16_t bits
Definition: base.h:252