Grok 10.0.3
ops/shared-inl.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target definitions shared by ops/*.h and user code.
17
18#include <cmath>
19
20#include "hwy/base.h"
21
22// Separate header because foreach_target.h re-enables its include guard.
24
25// Relies on the external include guard in highway.h.
27namespace hwy {
28namespace HWY_NAMESPACE {
29
30// Highway operations are implemented as overloaded functions selected using an
31// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
32// shift count applied to scalable vectors. Instead of referring to Simd<>
33// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
34// full vector, or fractions/groups if the argument is negative/positive),
35// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
36// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
37// cap. For constexpr-size vectors, N is the actual number of lanes. This
38// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
39template <typename Lane, size_t N, int kPow2>
40struct Simd {
41 constexpr Simd() = default;
42 using T = Lane;
43 static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
44
45 // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
46 // warns when using enums and non-enums in the same expression. Cannot be
47 // static constexpr function (another MSVC limitation).
48 static constexpr size_t kPrivateN = N;
49 static constexpr int kPrivatePow2 = kPow2;
50
51 template <typename NewT>
52 static constexpr size_t NewN() {
53 // Round up to correctly handle scalars with N=1.
54 return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
55 }
56
57#if HWY_HAVE_SCALABLE
58 template <typename NewT>
59 static constexpr int Pow2Ratio() {
60 return (sizeof(NewT) > sizeof(T))
61 ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
62 : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
63 }
64#endif
65
66 // Widening/narrowing ops change the number of lanes and/or their type.
67 // To initialize such vectors, we need the corresponding tag types:
68
69// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
70#if HWY_HAVE_SCALABLE
71 template <typename NewT>
72 using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
73#else
74 template <typename NewT>
76#endif
77
78 // Change lane type while keeping the same vector size, e.g. for MulEven.
79 template <typename NewT>
81
82// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
83// Round up to correctly handle scalars with N=1.
84#if HWY_HAVE_SCALABLE
85 // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
86 // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
87 using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
88#else
89 using Half = Simd<T, (N + 1) / 2, kPow2>;
90#endif
91
92// Twice the lanes while keeping the same lane type, e.g. for Combine.
93#if HWY_HAVE_SCALABLE
95#else
97#endif
98};
99
100namespace detail {
101
102template <typename T, size_t N, int kPow2>
103constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
104 return N == HWY_LANES(T) && kPow2 == 0;
105}
106
// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
// Positive shifts only enlarge the result when HWY_TARGET == HWY_RVV; in the
// other branch they return N unchanged.
constexpr size_t ScaleByPower(size_t N, int pow2) {
#if HWY_TARGET == HWY_RVV
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#else
  return pow2 >= 0 ? N : (N >> (-pow2));
#endif
}
118
119// Struct wrappers enable validation of arguments via static_assert.
120template <typename T, int kPow2>
122 static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
123#if HWY_TARGET == HWY_RVV
124 // Only RVV supports register groups.
125 using type = Simd<T, HWY_LANES(T), kPow2>;
126#elif HWY_HAVE_SCALABLE
127 // For SVE[2], only allow full or fractions.
128 using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
129#elif HWY_TARGET == HWY_SCALAR
130 using type = Simd<T, /*N=*/1, 0>;
131#else
132 // Only allow full or fractions.
133 using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
134#endif
135};
136
137template <typename T, size_t kLimit>
139 static_assert(kLimit != 0, "Does not make sense to have zero lanes");
140 // Safely handle non-power-of-two inputs by rounding down, which is allowed by
141 // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
142 static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
143 using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
144};
145
146template <typename T, size_t kNumLanes>
148 static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
149 static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
151};
152
153} // namespace detail
154
155// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
156// e.g. 1D loops where the application does not care about the vector size) or a
157// fraction/multiple of one. Multiples are the same as full vectors for all
158// targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
159// value of type promotion and demotion.
160template <typename T, int kPow2 = 0>
162
163// Alias for a tag describing a vector with *up to* kLimit active lanes, even on
164// targets with scalable vectors and HWY_SCALAR. The runtime lane count
165// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
166// typically used for 1D loops with a relatively low application-defined upper
167// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
168// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
169// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
170// this would enable vector-length-agnostic loops using ScalableTag).
171template <typename T, size_t kLimit>
173
174// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
175// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
176// two not exceeding `HWY_LANES(T)`.
177//
178// NOTE: if the application does not need to support HWY_SCALAR (+), use this
179// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
180// This is useful for data structures that rely on exactly 128-bit SIMD, but
181// these are discouraged because they cannot benefit from wider vectors.
182// Instead, applications would ideally define a larger problem size and loop
183// over it with the (unknown size) vectors from ScalableTag.
184//
185// + e.g. if the baseline is known to support SIMD, or the application requires
186// ops such as TableLookupBytes not supported by HWY_SCALAR.
187template <typename T, size_t kNumLanes>
189
// Lane type of the tag D, i.e. its member alias D::T.
template <class D>
using TFromD = typename D::T;
192
// Tag for the same number of lanes as D, but with the LaneType T.
// Forwards to the member alias template D::Rebind<T>.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
196
197template <class D>
199template <class D>
201template <class D>
203
// Tag for the same total size as D, but with the LaneType T.
// Forwards to the member alias template D::Repartition<T>.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
207
208template <class D>
210template <class D>
212
// Tag for the same lane type as D, but half the lanes.
// Forwards to the member alias D::Half.
template <class D>
using Half = typename D::Half;
216
// Tag for the same lane type as D, but twice the lanes.
// Forwards to the member alias D::Twice.
template <class D>
using Twice = typename D::Twice;
220
221template <typename T>
222using Full32 = Simd<T, 4 / sizeof(T), 0>;
223
224template <typename T>
225using Full64 = Simd<T, 8 / sizeof(T), 0>;
226
227template <typename T>
228using Full128 = Simd<T, 16 / sizeof(T), 0>;
229
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)

// MSVC workaround: use PrivateN directly instead of MaxLanes.
// Enabled when kPrivateN * sizeof(lane) is below / at least 16 bytes.
#define HWY_IF_LT128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
#define HWY_IF_GE128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr

// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
250
251template <class D>
252HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
253 return D::kPrivatePow2;
254}
255
// Enabled when the shift count of tag type D is at least MIN.
// MSVC requires the explicit <D>.
#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
258
259#if HWY_HAVE_SCALABLE
260
261// Upper bound on the number of lanes. Intended for template arguments and
262// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
263// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
264// actual size for allocating storage. WARNING: MSVC might not be able to deduce
265// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
266template <class D>
267HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
268 return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
269 D::kPrivatePow2);
270}
271
272#else
273// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
274// is not an option, nor does a member function work.
275template <class D>
277 return D::kPrivateN;
278}
279
280// (Potentially) non-constant actual size of the vector at runtime, subject to
281// the limit imposed by the Simd. Useful for advancing loop counters.
282// Targets with scalable vectors define this themselves.
283template <typename T, size_t N, int kPow2>
285 return N;
286}
287
288#endif // !HWY_HAVE_SCALABLE
289
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else
template <class V>
using VecArg = V;
#endif
308
309// NOLINTNEXTLINE(google-readability-namespace-comments)
310} // namespace HWY_NAMESPACE
311} // namespace hwy
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_MAYBE_UNUSED
Definition: base.h:73
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:111
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition: ops/shared-inl.h:103
V VecArg
Definition: ops/shared-inl.h:306
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition: ops/shared-inl.h:276
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: ops/shared-inl.h:172
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: ops/shared-inl.h:202
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:211
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:161
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D)
Definition: ops/shared-inl.h:252
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition: ops/shared-inl.h:188
typename D::Half Half
Definition: ops/shared-inl.h:215
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
N
Definition: rvv-inl.h:1742
typename D::T TFromD
Definition: ops/shared-inl.h:191
Definition: aligned_allocator.h:27
constexpr size_t FloorLog2(TI x)
Definition: base.h:770
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_LANES(T)
Definition: set_macros-inl.h:85
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: ops/shared-inl.h:40
constexpr Simd()=default
Simd< NewT, N, kPow2 > Rebind
Definition: ops/shared-inl.h:75
static constexpr size_t NewN()
Definition: ops/shared-inl.h:52
static constexpr int kPrivatePow2
Definition: ops/shared-inl.h:49
static constexpr size_t kPrivateN
Definition: ops/shared-inl.h:48
Lane T
Definition: ops/shared-inl.h:42
Definition: ops/shared-inl.h:138
static constexpr size_t kLimitPow2
Definition: ops/shared-inl.h:142
Definition: ops/shared-inl.h:147
Definition: ops/shared-inl.h:121