arm_neon-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 128-bit ARM64 NEON vectors and operations.
17// External include guard in highway.h - see comment there.
18
19// ARM NEON intrinsics are documented at:
20// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
21
22#include <stddef.h>
23#include <stdint.h>
24
25#include "hwy/ops/shared-inl.h"
26
27HWY_BEFORE_NAMESPACE();
28
29// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
30// the same target attribute as our code, see #834.
31HWY_DIAGNOSTICS(push)
32HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
33#include <arm_neon.h> // NOLINT(build/include_order)
34HWY_DIAGNOSTICS(pop)
35
36// Must come after arm_neon.h.
37namespace hwy {
38namespace HWY_NAMESPACE {
39
40namespace detail { // for code folding and Raw128
41
42// Macros used to define single and double function calls for multiple types
43// for full and half vectors. These macros are undefined at the end of the file.
44
45// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
46#define HWY_NEON_BUILD_TPL_1
47#define HWY_NEON_BUILD_TPL_2
48#define HWY_NEON_BUILD_TPL_3
49
50// HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
51// extend it to int32x4x2_t packs.
52#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
53#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
54#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
55
56// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
57#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
58#define HWY_NEON_BUILD_PARAM_2(type, size) \
59 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
60#define HWY_NEON_BUILD_PARAM_3(type, size) \
61 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
62 const Vec128<type##_t, size> c
63
64// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
65// function.
66#define HWY_NEON_BUILD_ARG_1 a.raw
67#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
68#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
69
70// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
71// the __VA_ARGS__ have been expanded. This allows "func" to be a macro on
72// itself like with some of the library "functions" such as vshlq_u8. For
73// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
74// "a, b" (without the quotes) will end up expanding "vshlq_u8(a, b)" if needed.
75// Directly writing vshlq_u8(MY_PARAMS) would fail since vshlq_u8() macro
76// expects two arguments.
77#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
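// [Added illustration, not part of the original header] With
//   #define MY_PARAMS a, b
// HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) first expands MY_PARAMS and only then
// forms vshlq_u8(a, b). Writing vshlq_u8(MY_PARAMS) directly would pass a
// single preprocessor argument, which fails when vshlq_u8 is itself a
// function-like macro expecting two arguments.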
78
79// Main macro definition that defines a single function for the given type and
80// size of vector, using the underlying (prefix##infix##suffix) function and
81// the template, return type, parameters and arguments defined by the "args"
82// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
83#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
84 HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
85 HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
86 name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
87 return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
88 HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
89 }
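// [Added illustration, not part of the original header] A hypothetical
// invocation HWY_NEON_DEF_FUNCTION(uint8, 16, Add, vaddq, _, u8, 2) would
// expand to roughly:
//   HWY_API Vec128<uint8_t, 16> Add(const Vec128<uint8_t, 16> a,
//                                   const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
//   }
// i.e. prefix##infix##suffix selects the intrinsic, while the BUILD_* macros
// chosen by "args" supply the signature and argument list.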
90
91// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
92// called "name" using the set of neon functions starting with the given
93// "prefix" for all the variants of certain types, as specified next to each
94// macro. For example, the prefix "vsub" can be used to define the operator-
95// using args=2.
96
97// uint8_t
98#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
99 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
100 HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \
101 HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \
102 HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \
103 HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
104
105// int8_t
106#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
107 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
108 HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \
109 HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \
110 HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \
111 HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
112
113// uint16_t
114#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
115 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
116 HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \
117 HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \
118 HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
119
120// int16_t
121#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
122 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
123 HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \
124 HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \
125 HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
126
127// uint32_t
128#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
129 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
130 HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \
131 HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
132
133// int32_t
134#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
135 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
136 HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \
137 HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
138
139// uint64_t
140#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
141 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
142 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
143
144// int64_t
145#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
146 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
147 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
148
149// float
150#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
151 HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
152 HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \
153 HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
154
155// double
156#if HWY_ARCH_ARM_A64
157#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \
158 HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
159 HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
160#else
161#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
162#endif
163
164// float and double
165
166#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
167 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
168 HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
169
170// Helper macros to define for more than one type.
171// uint8_t, uint16_t and uint32_t
172#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
173 HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
174 HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
175 HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
176
177// int8_t, int16_t and int32_t
178#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
179 HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
180 HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
181 HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
182
183// uint8_t, uint16_t, uint32_t and uint64_t
184#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
185 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
186 HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
187
188// int8_t, int16_t, int32_t and int64_t
189#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
190 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
191 HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
192
193// All int*_t and uint*_t up to 64
194#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
195 HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
196 HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
197
198// All previous types.
199#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
200 HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
201 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
202
203#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
204 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
205 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
206 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
207
208// For eor3q, which is only defined for full vectors.
209#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args) \
210 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
211 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
212 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
213 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
214 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
215 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
216 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
217 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
218
219// Emulation of some intrinsics on armv7.
220#if HWY_ARCH_ARM_V7
221#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
222#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
223#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
224#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
225#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
226#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
227#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
228#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
229#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
230#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
231#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
232#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
233#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
234#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
235#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
236#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
237#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
238#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
239#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
240#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
241#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
242#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
243#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
244#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
245#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
246#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
247#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
248#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
249#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
250#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
251#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
252#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
253#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
254#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
255#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
256#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
257#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
258#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
259#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
260#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
261#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
262#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
263#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
264#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
265#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
266#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
267#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
268#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
269#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
270#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
271#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
272#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
273#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
274#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
275#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
276#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
277#endif
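// [Added note, not part of the original header] On armv7, vuzp/vzip return
// both de-interleaved (resp. interleaved) halves in a x2 struct, so taking
// .val[0] and .val[1] reproduces the AArch64 vuzp1/vuzp2 and vzip1/vzip2
// intrinsics emulated above.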
278
279// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 overloads
280// for all vector types, even those (bfloat16_t) where the underlying vector is
281// the same as others (uint16_t).
282template <typename T, size_t N>
283struct Tuple2;
284template <typename T, size_t N>
285struct Tuple3;
286template <typename T, size_t N>
287struct Tuple4;
288
289template <>
290struct Tuple2<uint8_t, 16> {
291 uint8x16x2_t raw;
292};
293template <size_t N>
294struct Tuple2<uint8_t, N> {
295 uint8x8x2_t raw;
296};
297template <>
298struct Tuple2<int8_t, 16> {
299 int8x16x2_t raw;
300};
301template <size_t N>
302struct Tuple2<int8_t, N> {
303 int8x8x2_t raw;
304};
305template <>
306struct Tuple2<uint16_t, 8> {
307 uint16x8x2_t raw;
308};
309template <size_t N>
310struct Tuple2<uint16_t, N> {
311 uint16x4x2_t raw;
312};
313template <>
314struct Tuple2<int16_t, 8> {
315 int16x8x2_t raw;
316};
317template <size_t N>
318struct Tuple2<int16_t, N> {
319 int16x4x2_t raw;
320};
321template <>
322struct Tuple2<uint32_t, 4> {
323 uint32x4x2_t raw;
324};
325template <size_t N>
326struct Tuple2<uint32_t, N> {
327 uint32x2x2_t raw;
328};
329template <>
330struct Tuple2<int32_t, 4> {
331 int32x4x2_t raw;
332};
333template <size_t N>
334struct Tuple2<int32_t, N> {
335 int32x2x2_t raw;
336};
337template <>
338struct Tuple2<uint64_t, 2> {
339 uint64x2x2_t raw;
340};
341template <size_t N>
342struct Tuple2<uint64_t, N> {
343 uint64x1x2_t raw;
344};
345template <>
346struct Tuple2<int64_t, 2> {
347 int64x2x2_t raw;
348};
349template <size_t N>
350struct Tuple2<int64_t, N> {
351 int64x1x2_t raw;
352};
353
354template <>
355struct Tuple2<float16_t, 8> {
356 uint16x8x2_t raw;
357};
358template <size_t N>
359struct Tuple2<float16_t, N> {
360 uint16x4x2_t raw;
361};
362template <>
363struct Tuple2<bfloat16_t, 8> {
364 uint16x8x2_t raw;
365};
366template <size_t N>
367struct Tuple2<bfloat16_t, N> {
368 uint16x4x2_t raw;
369};
370
371template <>
372struct Tuple2<float32_t, 4> {
373 float32x4x2_t raw;
374};
375template <size_t N>
376struct Tuple2<float32_t, N> {
377 float32x2x2_t raw;
378};
379#if HWY_ARCH_ARM_A64
380template <>
381struct Tuple2<float64_t, 2> {
382 float64x2x2_t raw;
383};
384template <size_t N>
385struct Tuple2<float64_t, N> {
386 float64x1x2_t raw;
387};
388#endif // HWY_ARCH_ARM_A64
389
390template <>
391struct Tuple3<uint8_t, 16> {
392 uint8x16x3_t raw;
393};
394template <size_t N>
395struct Tuple3<uint8_t, N> {
396 uint8x8x3_t raw;
397};
398template <>
399struct Tuple3<int8_t, 16> {
400 int8x16x3_t raw;
401};
402template <size_t N>
403struct Tuple3<int8_t, N> {
404 int8x8x3_t raw;
405};
406template <>
407struct Tuple3<uint16_t, 8> {
408 uint16x8x3_t raw;
409};
410template <size_t N>
411struct Tuple3<uint16_t, N> {
412 uint16x4x3_t raw;
413};
414template <>
415struct Tuple3<int16_t, 8> {
416 int16x8x3_t raw;
417};
418template <size_t N>
419struct Tuple3<int16_t, N> {
420 int16x4x3_t raw;
421};
422template <>
423struct Tuple3<uint32_t, 4> {
424 uint32x4x3_t raw;
425};
426template <size_t N>
427struct Tuple3<uint32_t, N> {
428 uint32x2x3_t raw;
429};
430template <>
431struct Tuple3<int32_t, 4> {
432 int32x4x3_t raw;
433};
434template <size_t N>
435struct Tuple3<int32_t, N> {
436 int32x2x3_t raw;
437};
438template <>
439struct Tuple3<uint64_t, 2> {
440 uint64x2x3_t raw;
441};
442template <size_t N>
443struct Tuple3<uint64_t, N> {
444 uint64x1x3_t raw;
445};
446template <>
447struct Tuple3<int64_t, 2> {
448 int64x2x3_t raw;
449};
450template <size_t N>
451struct Tuple3<int64_t, N> {
452 int64x1x3_t raw;
453};
454
455template <>
456struct Tuple3<float16_t, 8> {
457 uint16x8x3_t raw;
458};
459template <size_t N>
460struct Tuple3<float16_t, N> {
461 uint16x4x3_t raw;
462};
463template <>
464struct Tuple3<bfloat16_t, 8> {
465 uint16x8x3_t raw;
466};
467template <size_t N>
468struct Tuple3<bfloat16_t, N> {
469 uint16x4x3_t raw;
470};
471
472template <>
473struct Tuple3<float32_t, 4> {
474 float32x4x3_t raw;
475};
476template <size_t N>
477struct Tuple3<float32_t, N> {
478 float32x2x3_t raw;
479};
480#if HWY_ARCH_ARM_A64
481template <>
482struct Tuple3<float64_t, 2> {
483 float64x2x3_t raw;
484};
485template <size_t N>
486struct Tuple3<float64_t, N> {
487 float64x1x3_t raw;
488};
489#endif // HWY_ARCH_ARM_A64
490
491template <>
492struct Tuple4<uint8_t, 16> {
493 uint8x16x4_t raw;
494};
495template <size_t N>
496struct Tuple4<uint8_t, N> {
497 uint8x8x4_t raw;
498};
499template <>
500struct Tuple4<int8_t, 16> {
501 int8x16x4_t raw;
502};
503template <size_t N>
504struct Tuple4<int8_t, N> {
505 int8x8x4_t raw;
506};
507template <>
508struct Tuple4<uint16_t, 8> {
509 uint16x8x4_t raw;
510};
511template <size_t N>
512struct Tuple4<uint16_t, N> {
513 uint16x4x4_t raw;
514};
515template <>
516struct Tuple4<int16_t, 8> {
517 int16x8x4_t raw;
518};
519template <size_t N>
520struct Tuple4<int16_t, N> {
521 int16x4x4_t raw;
522};
523template <>
524struct Tuple4<uint32_t, 4> {
525 uint32x4x4_t raw;
526};
527template <size_t N>
528struct Tuple4<uint32_t, N> {
529 uint32x2x4_t raw;
530};
531template <>
532struct Tuple4<int32_t, 4> {
533 int32x4x4_t raw;
534};
535template <size_t N>
536struct Tuple4<int32_t, N> {
537 int32x2x4_t raw;
538};
539template <>
540struct Tuple4<uint64_t, 2> {
541 uint64x2x4_t raw;
542};
543template <size_t N>
544struct Tuple4<uint64_t, N> {
545 uint64x1x4_t raw;
546};
547template <>
548struct Tuple4<int64_t, 2> {
549 int64x2x4_t raw;
550};
551template <size_t N>
552struct Tuple4<int64_t, N> {
553 int64x1x4_t raw;
554};
555
556template <>
557struct Tuple4<float16_t, 8> {
558 uint16x8x4_t raw;
559};
560template <size_t N>
561struct Tuple4<float16_t, N> {
562 uint16x4x4_t raw;
563};
564template <>
565struct Tuple4<bfloat16_t, 8> {
566 uint16x8x4_t raw;
567};
568template <size_t N>
569struct Tuple4<bfloat16_t, N> {
570 uint16x4x4_t raw;
571};
572
573template <>
574struct Tuple4<float32_t, 4> {
575 float32x4x4_t raw;
576};
577template <size_t N>
578struct Tuple4<float32_t, N> {
579 float32x2x4_t raw;
580};
581#if HWY_ARCH_ARM_A64
582template <>
583struct Tuple4<float64_t, 2> {
584 float64x2x4_t raw;
585};
586template <size_t N>
587struct Tuple4<float64_t, N> {
588 float64x1x4_t raw;
589};
590#endif // HWY_ARCH_ARM_A64
591
592template <typename T, size_t N>
593struct Raw128;
594
595// 128
596template <>
597struct Raw128<uint8_t, 16> {
598 using type = uint8x16_t;
599};
600
601template <>
602struct Raw128<uint16_t, 8> {
603 using type = uint16x8_t;
604};
605
606template <>
607struct Raw128<uint32_t, 4> {
608 using type = uint32x4_t;
609};
610
611template <>
612struct Raw128<uint64_t, 2> {
613 using type = uint64x2_t;
614};
615
616template <>
617struct Raw128<int8_t, 16> {
618 using type = int8x16_t;
619};
620
621template <>
622struct Raw128<int16_t, 8> {
623 using type = int16x8_t;
624};
625
626template <>
627struct Raw128<int32_t, 4> {
628 using type = int32x4_t;
629};
630
631template <>
632struct Raw128<int64_t, 2> {
633 using type = int64x2_t;
634};
635
636template <>
637struct Raw128<float16_t, 8> {
638 using type = uint16x8_t;
639};
640
641template <>
642struct Raw128<bfloat16_t, 8> {
643 using type = uint16x8_t;
644};
645
646template <>
647struct Raw128<float, 4> {
648 using type = float32x4_t;
649};
650
651#if HWY_ARCH_ARM_A64
652template <>
653struct Raw128<double, 2> {
654 using type = float64x2_t;
655};
656#endif
657
658// 64
659template <>
660struct Raw128<uint8_t, 8> {
661 using type = uint8x8_t;
662};
663
664template <>
665struct Raw128<uint16_t, 4> {
666 using type = uint16x4_t;
667};
668
669template <>
670struct Raw128<uint32_t, 2> {
671 using type = uint32x2_t;
672};
673
674template <>
675struct Raw128<uint64_t, 1> {
676 using type = uint64x1_t;
677};
678
679template <>
680struct Raw128<int8_t, 8> {
681 using type = int8x8_t;
682};
683
684template <>
685struct Raw128<int16_t, 4> {
686 using type = int16x4_t;
687};
688
689template <>
690struct Raw128<int32_t, 2> {
691 using type = int32x2_t;
692};
693
694template <>
695struct Raw128<int64_t, 1> {
696 using type = int64x1_t;
697};
698
699template <>
700struct Raw128<float16_t, 4> {
701 using type = uint16x4_t;
702};
703
704template <>
705struct Raw128<bfloat16_t, 4> {
706 using type = uint16x4_t;
707};
708
709template <>
710struct Raw128<float, 2> {
711 using type = float32x2_t;
712};
713
714#if HWY_ARCH_ARM_A64
715template <>
716struct Raw128<double, 1> {
717 using type = float64x1_t;
718};
719#endif
720
721// 32 (same as 64)
722template <>
723struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};
724
725template <>
726struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};
727
728template <>
729struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {};
730
731template <>
732struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};
733
734template <>
735struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {};
736
737template <>
738struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};
739
740template <>
741struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};
742
743template <>
744struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};
745
746template <>
747struct Raw128<float, 1> : public Raw128<float, 2> {};
748
749// 16 (same as 64)
750template <>
751struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};
752
753template <>
754struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};
755
756template <>
757struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};
758
759template <>
760struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};
761
762template <>
763struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};
764
765template <>
766struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};
767
768// 8 (same as 64)
769template <>
770struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};
771
772template <>
773struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};
774
775} // namespace detail
776
777template <typename T, size_t N = 16 / sizeof(T)>
778class Vec128 {
779 using Raw = typename detail::Raw128<T, N>::type;
780
781 public:
782 using PrivateT = T; // only for DFromV
783 static constexpr size_t kPrivateN = N; // only for DFromV
784
785 HWY_INLINE Vec128() = default;
786 Vec128(const Vec128&) = default;
787 Vec128& operator=(const Vec128&) = default;
788 HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
789
790 // Compound assignment. Only usable if there is a corresponding non-member
791 // binary operator overload. For example, only f32 and f64 support division.
792 HWY_INLINE Vec128& operator*=(const Vec128 other) {
793 return *this = (*this * other);
794 }
795 HWY_INLINE Vec128& operator/=(const Vec128 other) {
796 return *this = (*this / other);
797 }
798 HWY_INLINE Vec128& operator+=(const Vec128 other) {
799 return *this = (*this + other);
800 }
801 HWY_INLINE Vec128& operator-=(const Vec128 other) {
802 return *this = (*this - other);
803 }
804 HWY_INLINE Vec128& operator&=(const Vec128 other) {
805 return *this = (*this & other);
806 }
807 HWY_INLINE Vec128& operator|=(const Vec128 other) {
808 return *this = (*this | other);
809 }
810 HWY_INLINE Vec128& operator^=(const Vec128 other) {
811 return *this = (*this ^ other);
812 }
813
814 Raw raw;
815};
816
817template <typename T>
818using Vec64 = Vec128<T, 8 / sizeof(T)>;
819
820template <typename T>
821using Vec32 = Vec128<T, 4 / sizeof(T)>;
822
823// FF..FF or 0.
824template <typename T, size_t N = 16 / sizeof(T)>
825class Mask128 {
826 // ARM C Language Extensions return and expect unsigned type.
827 using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
828
829 public:
830 HWY_INLINE Mask128() = default;
831 Mask128(const Mask128&) = default;
832 Mask128& operator=(const Mask128&) = default;
833 HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}
834
835 Raw raw;
836};
837
838template <typename T>
839using Mask64 = Mask128<T, 8 / sizeof(T)>;
840
841template <class V>
842using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
843
844template <class V>
845using TFromV = typename V::PrivateT;
846
847// ------------------------------ BitCast
848
849namespace detail {
850
851// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
852// vreinterpret*_u8_*() set of functions.
853#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
854#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
855 Vec128<uint8_t, size * sizeof(type##_t)>
856#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
857#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
858
859// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
860template <size_t N>
861HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
862 return v;
863}
864
865HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
866 HWY_CAST_TO_U8)
867HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
868HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
869HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
870HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
871
872// Special cases for [b]float16_t, which have the same Raw as uint16_t.
873template <size_t N>
874HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
875 return BitCastToByte(Vec128<uint16_t, N>(v.raw));
876}
877template <size_t N>
878HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
879 return BitCastToByte(Vec128<uint16_t, N>(v.raw));
880}
881
882#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
883#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
884#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
885#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
886
887template <size_t N>
888HWY_INLINE Vec128<uint8_t, N> BitCastFromByte(Simd<uint8_t, N, 0> /* tag */,
889 Vec128<uint8_t, N> v) {
890 return v;
891}
892
893// 64-bit or less:
894
895template <size_t N, HWY_IF_LE64(int8_t, N)>
896HWY_INLINE Vec128<int8_t, N> BitCastFromByte(Simd<int8_t, N, 0> /* tag */,
897 Vec128<uint8_t, N> v) {
898 return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw));
899}
900template <size_t N, HWY_IF_LE64(uint16_t, N)>
901HWY_INLINE Vec128<uint16_t, N> BitCastFromByte(Simd<uint16_t, N, 0> /* tag */,
902 Vec128<uint8_t, N * 2> v) {
903 return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw));
904}
905template <size_t N, HWY_IF_LE64(int16_t, N)>
906HWY_INLINE Vec128<int16_t, N> BitCastFromByte(Simd<int16_t, N, 0> /* tag */,
907 Vec128<uint8_t, N * 2> v) {
908 return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw));
909}
910template <size_t N, HWY_IF_LE64(uint32_t, N)>
911HWY_INLINE Vec128<uint32_t, N> BitCastFromByte(Simd<uint32_t, N, 0> /* tag */,
912 Vec128<uint8_t, N * 4> v) {
913 return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw));
914}
915template <size_t N, HWY_IF_LE64(int32_t, N)>
916HWY_INLINE Vec128<int32_t, N> BitCastFromByte(Simd<int32_t, N, 0> /* tag */,
917 Vec128<uint8_t, N * 4> v) {
918 return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw));
919}
920template <size_t N, HWY_IF_LE64(float, N)>
921HWY_INLINE Vec128<float, N> BitCastFromByte(Simd<float, N, 0> /* tag */,
922 Vec128<uint8_t, N * 4> v) {
923 return Vec128<float, N>(vreinterpret_f32_u8(v.raw));
924}
925HWY_INLINE Vec64<uint64_t> BitCastFromByte(Full64<uint64_t> /* tag */,
926 Vec64<uint8_t> v) {
927 return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
928}
929HWY_INLINE Vec64<int64_t> BitCastFromByte(Full64<int64_t> /* tag */,
930 Vec64<uint8_t> v) {
931 return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
932}
933#if HWY_ARCH_ARM_A64
934HWY_INLINE Vec64<double> BitCastFromByte(Full64<double> /* tag */,
935 Vec64<uint8_t> v) {
936 return Vec64<double>(vreinterpret_f64_u8(v.raw));
937}
938#endif
939
940// 128-bit full:
941
942HWY_INLINE Vec128<int8_t> BitCastFromByte(Full128<int8_t> /* tag */,
943 Vec128<uint8_t> v) {
944 return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
945}
946HWY_INLINE Vec128<uint16_t> BitCastFromByte(Full128<uint16_t> /* tag */,
947 Vec128<uint8_t> v) {
948 return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
949}
950HWY_INLINE Vec128<int16_t> BitCastFromByte(Full128<int16_t> /* tag */,
951 Vec128<uint8_t> v) {
952 return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
953}
954HWY_INLINE Vec128<uint32_t> BitCastFromByte(Full128<uint32_t> /* tag */,
955 Vec128<uint8_t> v) {
956 return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
957}
958HWY_INLINE Vec128<int32_t> BitCastFromByte(Full128<int32_t> /* tag */,
959 Vec128<uint8_t> v) {
960 return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
961}
962HWY_INLINE Vec128<float> BitCastFromByte(Full128<float> /* tag */,
963 Vec128<uint8_t> v) {
964 return Vec128<float>(vreinterpretq_f32_u8(v.raw));
965}
966HWY_INLINE Vec128<uint64_t> BitCastFromByte(Full128<uint64_t> /* tag */,
967 Vec128<uint8_t> v) {
968 return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
969}
970HWY_INLINE Vec128<int64_t> BitCastFromByte(Full128<int64_t> /* tag */,
971 Vec128<uint8_t> v) {
972 return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
973}
974
975#if HWY_ARCH_ARM_A64
976HWY_INLINE Vec128<double> BitCastFromByte(Full128<double> /* tag */,
977 Vec128<uint8_t> v) {
978 return Vec128<double>(vreinterpretq_f64_u8(v.raw));
979}
980#endif
981
982// Special cases for [b]float16_t, which have the same Raw as uint16_t.
983template <size_t N>
984HWY_INLINE Vec128<float16_t, N> BitCastFromByte(Simd<float16_t, N, 0> /* tag */,
985 Vec128<uint8_t, N * 2> v) {
986 return Vec128<float16_t, N>(BitCastFromByte(Simd<uint16_t, N, 0>(), v).raw);
987}
988template <size_t N>
989HWY_INLINE Vec128<bfloat16_t, N> BitCastFromByte(
990 Simd<bfloat16_t, N, 0> /* tag */, Vec128<uint8_t, N * 2> v) {
991 return Vec128<bfloat16_t, N>(BitCastFromByte(Simd<uint16_t, N, 0>(), v).raw);
992}
993
994} // namespace detail
995
996template <typename T, size_t N, typename FromT>
997HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
998 Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
999 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
1000}
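// [Added usage example, not part of the original header] BitCast
// reinterprets lane bits without any conversion, e.g.:
//   const Full128<uint32_t> du;
//   Vec128<uint32_t> bits = BitCast(du, v);  // v is a Vec128<float>
// Each lane of `bits` then holds the IEEE-754 encoding of the corresponding
// float lane (1.0f becomes 0x3F800000).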
1001
1002// ------------------------------ Set
1003
1004// Returns a vector with all lanes set to "t".
1005#define HWY_NEON_BUILD_TPL_HWY_SET1
1006#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
1007#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
1008 Simd<type##_t, size, 0> /* tag */, const type##_t t
1009#define HWY_NEON_BUILD_ARG_HWY_SET1 t
1010
1011HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1)
1012
1013#undef HWY_NEON_BUILD_TPL_HWY_SET1
1014#undef HWY_NEON_BUILD_RET_HWY_SET1
1015#undef HWY_NEON_BUILD_PARAM_HWY_SET1
1016#undef HWY_NEON_BUILD_ARG_HWY_SET1
1017
1018// Returns an all-zero vector.
1019template <typename T, size_t N>
1020HWY_API Vec128<T, N> Zero(Simd<T, N, 0> d) {
1021 return Set(d, 0);
1022}
1023
1024template <size_t N>
1025HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
1026 return Vec128<bfloat16_t, N>(Zero(Simd<uint16_t, N, 0>()).raw);
1027}
1028
1029template <class D>
1030using VFromD = decltype(Zero(D()));
1031
1032HWY_DIAGNOSTICS(push)
1033HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
1034#if HWY_COMPILER_GCC_ACTUAL
1035 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
1036#endif
1037
1038// Returns a vector with uninitialized elements.
1039template <typename T, size_t N>
1040HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
1041 typename detail::Raw128<T, N>::type a;
1042 return Vec128<T, N>(a);
1043}
1044
1045HWY_DIAGNOSTICS(pop)
1046
1047// Returns a vector with lane i=[0, N) set to "first" + i.
1048template <typename T, size_t N, typename T2>
1049Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
1050 HWY_ALIGN T lanes[16 / sizeof(T)];
1051 for (size_t i = 0; i < 16 / sizeof(T); ++i) {
1052 lanes[i] =
1053 AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
1054 }
1055 return Load(d, lanes);
1056}
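// [Added example, not part of the original header]
//   Iota(Full128<int32_t>(), 5)  // lanes are {5, 6, 7, 8}
// AddWithWraparound keeps the increment well-defined for both integer and
// floating-point lane types.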
1057
1058// ------------------------------ GetLane
1059
1060namespace detail {
1061#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
1062#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
1063#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
1064#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
1065
1066HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
1067
1068#undef HWY_NEON_BUILD_TPL_HWY_GET
1069#undef HWY_NEON_BUILD_RET_HWY_GET
1070#undef HWY_NEON_BUILD_PARAM_HWY_GET
1071#undef HWY_NEON_BUILD_ARG_HWY_GET
1072
1073} // namespace detail
1074
1075template <class V>
1076HWY_API TFromV<V> GetLane(const V v) {
1077 return detail::GetLane<0>(v);
1078}
1079
1080// ------------------------------ ExtractLane
1081
1082// Requires one overload per vector length because GetLane<3> is a compile error
1083// if v is a uint32x2_t.
1084template <typename T>
1085HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
1086 HWY_DASSERT(i == 0);
1087 (void)i;
1088 return detail::GetLane<0>(v);
1089}
1090
1091template <typename T>
1092HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
1093#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1094 if (__builtin_constant_p(i)) {
1095 switch (i) {
1096 case 0:
1097 return detail::GetLane<0>(v);
1098 case 1:
1099 return detail::GetLane<1>(v);
1100 }
1101 }
1102#endif
1103 alignas(16) T lanes[2];
1104 Store(v, DFromV<decltype(v)>(), lanes);
1105 return lanes[i];
1106}
1107
1108template <typename T>
1109HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
1110#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1111 if (__builtin_constant_p(i)) {
1112 switch (i) {
1113 case 0:
1114 return detail::GetLane<0>(v);
1115 case 1:
1116 return detail::GetLane<1>(v);
1117 case 2:
1118 return detail::GetLane<2>(v);
1119 case 3:
1120 return detail::GetLane<3>(v);
1121 }
1122 }
1123#endif
1124 alignas(16) T lanes[4];
1125 Store(v, DFromV<decltype(v)>(), lanes);
1126 return lanes[i];
1127}
1128
1129template <typename T>
1130HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
1131#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1132 if (__builtin_constant_p(i)) {
1133 switch (i) {
1134 case 0:
1135 return detail::GetLane<0>(v);
1136 case 1:
1137 return detail::GetLane<1>(v);
1138 case 2:
1139 return detail::GetLane<2>(v);
1140 case 3:
1141 return detail::GetLane<3>(v);
1142 case 4:
1143 return detail::GetLane<4>(v);
1144 case 5:
1145 return detail::GetLane<5>(v);
1146 case 6:
1147 return detail::GetLane<6>(v);
1148 case 7:
1149 return detail::GetLane<7>(v);
1150 }
1151 }
1152#endif
1153 alignas(16) T lanes[8];
1154 Store(v, DFromV<decltype(v)>(), lanes);
1155 return lanes[i];
1156}
1157
1158template <typename T>
1159HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
1160#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1161 if (__builtin_constant_p(i)) {
1162 switch (i) {
1163 case 0:
1164 return detail::GetLane<0>(v);
1165 case 1:
1166 return detail::GetLane<1>(v);
1167 case 2:
1168 return detail::GetLane<2>(v);
1169 case 3:
1170 return detail::GetLane<3>(v);
1171 case 4:
1172 return detail::GetLane<4>(v);
1173 case 5:
1174 return detail::GetLane<5>(v);
1175 case 6:
1176 return detail::GetLane<6>(v);
1177 case 7:
1178 return detail::GetLane<7>(v);
1179 case 8:
1180 return detail::GetLane<8>(v);
1181 case 9:
1182 return detail::GetLane<9>(v);
1183 case 10:
1184 return detail::GetLane<10>(v);
1185 case 11:
1186 return detail::GetLane<11>(v);
1187 case 12:
1188 return detail::GetLane<12>(v);
1189 case 13:
1190 return detail::GetLane<13>(v);
1191 case 14:
1192 return detail::GetLane<14>(v);
1193 case 15:
1194 return detail::GetLane<15>(v);
1195 }
1196 }
1197#endif
1198 alignas(16) T lanes[16];
1199 Store(v, DFromV<decltype(v)>(), lanes);
1200 return lanes[i];
1201}
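// [Added note, not part of the original header] The __builtin_constant_p
// branches above let a compile-time-constant index lower to a single
// vget_lane instruction; for a runtime index, the vector is stored to an
// aligned stack array and the requested lane is read back from memory.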
1202
1203// ------------------------------ InsertLane
1204
1205namespace detail {
1206#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
1207#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
1208#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1209 Vec128<type##_t, size> v, type##_t t
1210#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
1211
1212HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
1213
1214#undef HWY_NEON_BUILD_TPL_HWY_INSERT
1215#undef HWY_NEON_BUILD_RET_HWY_INSERT
1216#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1217#undef HWY_NEON_BUILD_ARG_HWY_INSERT
1218
1219} // namespace detail
1220
1221// Requires one overload per vector length because InsertLane<3> may be a
1222// compile error.
1223
1224template <typename T>
1225HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
1226 HWY_DASSERT(i == 0);
1227 (void)i;
1228 return Set(DFromV<decltype(v)>(), t);
1229}
1230
1231template <typename T>
1232HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
1233#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1234 if (__builtin_constant_p(i)) {
1235 switch (i) {
1236 case 0:
1237 return detail::InsertLane<0>(v, t);
1238 case 1:
1239 return detail::InsertLane<1>(v, t);
1240 }
1241 }
1242#endif
1243 const DFromV<decltype(v)> d;
1244 alignas(16) T lanes[2];
1245 Store(v, d, lanes);
1246 lanes[i] = t;
1247 return Load(d, lanes);
1248}
1249
1250template <typename T>
1251HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
1252#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1253 if (__builtin_constant_p(i)) {
1254 switch (i) {
1255 case 0:
1256 return detail::InsertLane<0>(v, t);
1257 case 1:
1258 return detail::InsertLane<1>(v, t);
1259 case 2:
1260 return detail::InsertLane<2>(v, t);
1261 case 3:
1262 return detail::InsertLane<3>(v, t);
1263 }
1264 }
1265#endif
1266 const DFromV<decltype(v)> d;
1267 alignas(16) T lanes[4];
1268 Store(v, d, lanes);
1269 lanes[i] = t;
1270 return Load(d, lanes);
1271}
1272
1273template <typename T>
1274HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
1275#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1276 if (__builtin_constant_p(i)) {
1277 switch (i) {
1278 case 0:
1279 return detail::InsertLane<0>(v, t);
1280 case 1:
1281 return detail::InsertLane<1>(v, t);
1282 case 2:
1283 return detail::InsertLane<2>(v, t);
1284 case 3:
1285 return detail::InsertLane<3>(v, t);
1286 case 4:
1287 return detail::InsertLane<4>(v, t);
1288 case 5:
1289 return detail::InsertLane<5>(v, t);
1290 case 6:
1291 return detail::InsertLane<6>(v, t);
1292 case 7:
1293 return detail::InsertLane<7>(v, t);
1294 }
1295 }
1296#endif
1297 const DFromV<decltype(v)> d;
1298 alignas(16) T lanes[8];
1299 Store(v, d, lanes);
1300 lanes[i] = t;
1301 return Load(d, lanes);
1302}
1303
1304template <typename T>
1305HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
1306#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1307 if (__builtin_constant_p(i)) {
1308 switch (i) {
1309 case 0:
1310 return detail::InsertLane<0>(v, t);
1311 case 1:
1312 return detail::InsertLane<1>(v, t);
1313 case 2:
1314 return detail::InsertLane<2>(v, t);
1315 case 3:
1316 return detail::InsertLane<3>(v, t);
1317 case 4:
1318 return detail::InsertLane<4>(v, t);
1319 case 5:
1320 return detail::InsertLane<5>(v, t);
1321 case 6:
1322 return detail::InsertLane<6>(v, t);
1323 case 7:
1324 return detail::InsertLane<7>(v, t);
1325 case 8:
1326 return detail::InsertLane<8>(v, t);
1327 case 9:
1328 return detail::InsertLane<9>(v, t);
1329 case 10:
1330 return detail::InsertLane<10>(v, t);
1331 case 11:
1332 return detail::InsertLane<11>(v, t);
1333 case 12:
1334 return detail::InsertLane<12>(v, t);
1335 case 13:
1336 return detail::InsertLane<13>(v, t);
1337 case 14:
1338 return detail::InsertLane<14>(v, t);
1339 case 15:
1340 return detail::InsertLane<15>(v, t);
1341 }
1342 }
1343#endif
1344 const DFromV<decltype(v)> d;
1345 alignas(16) T lanes[16];
1346 Store(v, d, lanes);
1347 lanes[i] = t;
1348 return Load(d, lanes);
1349}
1350
1351// ================================================== ARITHMETIC
1352
1353// ------------------------------ Addition
1354HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
1355
1356// ------------------------------ Subtraction
1357HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
1358
1359// ------------------------------ SumsOf8
1360
1361HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
1362 return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
1363}
1364HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
1365 return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
1366}
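// [Added note, not part of the original header] vpaddl performs pairwise
// widening adds, so the chain u8 -> u16 -> u32 -> u64 above sums each group
// of 8 consecutive bytes into one 64-bit lane, e.g. bytes {1..8} produce 36.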
1367
1368// ------------------------------ SaturatedAdd
1369// Only defined for uint8_t, uint16_t and their signed versions, as in other
1370// architectures.
1371
1372// Returns a + b clamped to the destination range.
1373HWY_NEON_DEF_FUNCTION_INT_8(SaturatedAdd, vqadd, _, 2)
1374HWY_NEON_DEF_FUNCTION_INT_16(SaturatedAdd, vqadd, _, 2)
1375HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedAdd, vqadd, _, 2)
1376HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedAdd, vqadd, _, 2)
1377
1378// ------------------------------ SaturatedSub
1379
1380// Returns a - b clamped to the destination range.
1381HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2)
1382HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2)
1383HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2)
1384HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2)
1385
1386// Not part of API, used in implementation.
1387namespace detail {
1388HWY_NEON_DEF_FUNCTION_UINT_32(SaturatedSub, vqsub, _, 2)
1389HWY_NEON_DEF_FUNCTION_UINT_64(SaturatedSub, vqsub, _, 2)
1390HWY_NEON_DEF_FUNCTION_INT_32(SaturatedSub, vqsub, _, 2)
1391HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSub, vqsub, _, 2)
1392} // namespace detail
1393
1394// ------------------------------ Average
1395
1396// Returns (a + b + 1) / 2
1397HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
1398HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
1399
1400// ------------------------------ Neg
1401
1402HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
1403HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below
1404
1405HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) {
1406#if HWY_ARCH_ARM_A64
1407 return Vec64<int64_t>(vneg_s64(v.raw));
1408#else
1409 return Zero(Full64<int64_t>()) - v;
1410#endif
1411}
1412
1413HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
1414#if HWY_ARCH_ARM_A64
1415 return Vec128<int64_t>(vnegq_s64(v.raw));
1416#else
1417 return Zero(Full128<int64_t>()) - v;
1418#endif
1419}
1420
1421// ------------------------------ ShiftLeft
1422
1423// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
1424#pragma push_macro("HWY_NEON_DEF_FUNCTION")
1425#undef HWY_NEON_DEF_FUNCTION
1426#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
1427 template <int kBits> \
1428 HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
1429 return kBits == 0 ? v \
1430 : Vec128<type##_t, size>(HWY_NEON_EVAL( \
1431 prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
1432 }
1433
1434HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)
1435
1436HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
1437HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
1438
1439#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
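// [Added example, not part of the original header]
//   ShiftLeft<3>(Set(Full128<uint32_t>(), 1u))  // 8 in every lane
// The kBits == 0 special case above returns v unchanged because some of the
// underlying immediate-shift intrinsics do not accept a zero count (hence the
// HWY_MAX(1, kBits) in the unused branch).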
1440
1441// ------------------------------ RotateRight (ShiftRight, Or)
1442
1443template <int kBits, size_t N>
1444HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
1445 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1446 if (kBits == 0) return v;
1447 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
1448}
1449
1450template <int kBits, size_t N>
1451HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
1452 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1453 if (kBits == 0) return v;
1454 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
1455}
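// [Added example, not part of the original header]
//   RotateRight<8>(Set(Full128<uint32_t>(), 0x11223344u))  // 0x44112233
// i.e. the result is Or(v >> 8, v << 24); the HWY_MIN guard only keeps the
// (unused) ShiftLeft count valid when kBits == 0.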
1456
1457// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
1458// mechanism for checking for extensions to ARMv8.
1459
1460// ------------------------------ Shl
1461
1462HWY_API Vec128<uint8_t> operator<<(const Vec128<uint8_t> v,
1463 const Vec128<uint8_t> bits) {
1464 return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
1465}
1466template <size_t N, HWY_IF_LE64(uint8_t, N)>
1467HWY_API Vec128<uint8_t, N> operator<<(const Vec128<uint8_t, N> v,
1468 const Vec128<uint8_t, N> bits) {
1469 return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
1470}
1471
1472HWY_API Vec128<uint16_t> operator<<(const Vec128<uint16_t> v,
1473 const Vec128<uint16_t> bits) {
1474 return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
1475}
1476template <size_t N, HWY_IF_LE64(uint16_t, N)>
1477HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
1478 const Vec128<uint16_t, N> bits) {
1479 return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
1480}
1481
1482HWY_API Vec128<uint32_t> operator<<(const Vec128<uint32_t> v,
1483 const Vec128<uint32_t> bits) {
1484 return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
1485}
1486template <size_t N, HWY_IF_LE64(uint32_t, N)>
1487HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
1488 const Vec128<uint32_t, N> bits) {
1489 return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
1490}
1491
1492HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
1493 const Vec128<uint64_t> bits) {
1494 return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
1495}
1496HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
1497 const Vec64<uint64_t> bits) {
1498 return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
1499}
1500
1501HWY_API Vec128<int8_t> operator<<(const Vec128<int8_t> v,
1502 const Vec128<int8_t> bits) {
1503 return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
1504}
1505template <size_t N, HWY_IF_LE64(int8_t, N)>
1506HWY_API Vec128<int8_t, N> operator<<(const Vec128<int8_t, N> v,
1507 const Vec128<int8_t, N> bits) {
1508 return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
1509}
1510
1511HWY_API Vec128<int16_t> operator<<(const Vec128<int16_t> v,
1512 const Vec128<int16_t> bits) {
1513 return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
1514}
1515template <size_t N, HWY_IF_LE64(int16_t, N)>
1516HWY_API Vec128<int16_t, N> operator<<(const Vec128<int16_t, N> v,
1517 const Vec128<int16_t, N> bits) {
1518 return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
1519}
1520
1521HWY_API Vec128<int32_t> operator<<(const Vec128<int32_t> v,
1522 const Vec128<int32_t> bits) {
1523 return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
1524}
1525template <size_t N, HWY_IF_LE64(int32_t, N)>
1526HWY_API Vec128<int32_t, N> operator<<(const Vec128<int32_t, N> v,
1527 const Vec128<int32_t, N> bits) {
1528 return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
1529}
1530
1531HWY_API Vec128<int64_t> operator<<(const Vec128<int64_t> v,
1532 const Vec128<int64_t> bits) {
1533 return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
1534}
1535HWY_API Vec64<int64_t> operator<<(const Vec64<int64_t> v,
1536 const Vec64<int64_t> bits) {
1537 return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
1538}
1539
1540// ------------------------------ Shr (Neg)
1541
1542HWY_API Vec128<uint8_t> operator>>(const Vec128<uint8_t> v,
1543 const Vec128<uint8_t> bits) {
1544 const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
1545 return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
1546}
1547template <size_t N, HWY_IF_LE64(uint8_t, N)>
1548HWY_API Vec128<uint8_t, N> operator>>(const Vec128<uint8_t, N> v,
1549 const Vec128<uint8_t, N> bits) {
1550 const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N, 0>(), bits)).raw;
1551 return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
1552}
1553
1554HWY_API Vec128<uint16_t> operator>>(const Vec128<uint16_t> v,
1555 const Vec128<uint16_t> bits) {
1556 const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
1557 return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
1558}
1559template <size_t N, HWY_IF_LE64(uint16_t, N)>
1560HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> v,
1561 const Vec128<uint16_t, N> bits) {
1562 const int16x4_t neg_bits = Neg(BitCast(Simd<int16_t, N, 0>(), bits)).raw;
1563 return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
1564}
1565
1566HWY_API Vec128<uint32_t> operator>>(const Vec128<uint32_t> v,
1567 const Vec128<uint32_t> bits) {
1568 const int32x4_t neg_bits = Neg(BitCast(Full128<int32_t>(), bits)).raw;
1569 return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
1570}
1571template <size_t N, HWY_IF_LE64(uint32_t, N)>
1572HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> v,
1573 const Vec128<uint32_t, N> bits) {
1574 const int32x2_t neg_bits = Neg(BitCast(Simd<int32_t, N, 0>(), bits)).raw;
1575 return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
1576}
1577
1578HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
1579 const Vec128<uint64_t> bits) {
1580 const int64x2_t neg_bits = Neg(BitCast(Full128<int64_t>(), bits)).raw;
1581 return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
1582}
1583HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
1584 const Vec64<uint64_t> bits) {
1585 const int64x1_t neg_bits = Neg(BitCast(Full64<int64_t>(), bits)).raw;
1586 return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
1587}
1588
1589HWY_API Vec128<int8_t> operator>>(const Vec128<int8_t> v,
1590 const Vec128<int8_t> bits) {
1591 return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
1592}
1593template <size_t N, HWY_IF_LE64(int8_t, N)>
1594HWY_API Vec128<int8_t, N> operator>>(const Vec128<int8_t, N> v,
1595 const Vec128<int8_t, N> bits) {
1596 return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
1597}
1598
1599HWY_API Vec128<int16_t> operator>>(const Vec128<int16_t> v,
1600 const Vec128<int16_t> bits) {
1601 return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
1602}
1603template <size_t N, HWY_IF_LE64(int16_t, N)>
1604HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
1605 const Vec128<int16_t, N> bits) {
1606 return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
1607}
1608
1609HWY_API Vec128<int32_t> operator>>(const Vec128<int32_t> v,
1610 const Vec128<int32_t> bits) {
1611 return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
1612}
1613template <size_t N, HWY_IF_LE64(int32_t, N)>
1614HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
1615 const Vec128<int32_t, N> bits) {
1616 return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
1617}
1618
1619HWY_API Vec128<int64_t> operator>>(const Vec128<int64_t> v,
1620 const Vec128<int64_t> bits) {
1621 return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
1622}
1623HWY_API Vec64<int64_t> operator>>(const Vec64<int64_t> v,
1624 const Vec64<int64_t> bits) {
1625 return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
1626}
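// [Added note, not part of the original header] NEON only provides a variable
// shift-left (vshl) whose per-lane count is signed and shifts right when
// negative; the operator>> overloads above therefore negate the count vector
// (reinterpreting unsigned counts as signed) rather than using a dedicated
// variable right-shift instruction.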
1627
1628// ------------------------------ ShiftLeftSame (Shl)
1629
1630template <typename T, size_t N>
1631HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
1632 return v << Set(Simd<T, N, 0>(), static_cast<T>(bits));
1633}
1634template <typename T, size_t N>
1635HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
1636 return v >> Set(Simd<T, N, 0>(), static_cast<T>(bits));
1637}
1638
1639// ------------------------------ Integer multiplication
1640
1641// Unsigned
1642HWY_API Vec128<uint16_t> operator*(const Vec128<uint16_t> a,
1643 const Vec128<uint16_t> b) {
1644 return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw));
1645}
1646HWY_API Vec128<uint32_t> operator*(const Vec128<uint32_t> a,
1647 const Vec128<uint32_t> b) {
1648 return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw));
1649}
1650
1651template <size_t N, HWY_IF_LE64(uint16_t, N)>
1652HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
1653 const Vec128<uint16_t, N> b) {
1654 return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw));
1655}
1656template <size_t N, HWY_IF_LE64(uint32_t, N)>
1657HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
1658 const Vec128<uint32_t, N> b) {
1659 return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw));
1660}
1661
1662// Signed
1663HWY_API Vec128<int16_t> operator*(const Vec128<int16_t> a,
1664 const Vec128<int16_t> b) {
1665 return Vec128<int16_t>(vmulq_s16(a.raw, b.raw));
1666}
1667HWY_API Vec128<int32_t> operator*(const Vec128<int32_t> a,
1668 const Vec128<int32_t> b) {
1669 return Vec128<int32_t>(vmulq_s32(a.raw, b.raw));
1670}
1671
1672template <size_t N, HWY_IF_LE64(uint16_t, N)>
1673HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
1674 const Vec128<int16_t, N> b) {
1675 return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw));
1676}
1677template <size_t N, HWY_IF_LE64(int32_t, N)>
1678HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
1679 const Vec128<int32_t, N> b) {
1680 return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw));
1681}
1682
1683// Returns the upper 16 bits of a * b in each lane.
1684HWY_API Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
1685 const Vec128<int16_t> b) {
1686 int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
1687#if HWY_ARCH_ARM_A64
1688 int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
1689#else
1690 int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
1691#endif
1692 return Vec128<int16_t>(
1693 vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1694}
1695HWY_API Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
1696 const Vec128<uint16_t> b) {
1697 uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
1698#if HWY_ARCH_ARM_A64
1699 uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
1700#else
1701 uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
1702#endif
1703 return Vec128<uint16_t>(
1704 vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1705}
1706
1707template <size_t N, HWY_IF_LE64(int16_t, N)>
1708HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
1709 const Vec128<int16_t, N> b) {
1710 int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
1711 return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
1712}
1713template <size_t N, HWY_IF_LE64(uint16_t, N)>
1714HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
1715 const Vec128<uint16_t, N> b) {
1716 uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
1717 return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
1718}
1719
1720HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
1721 return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
1722}
1723template <size_t N, HWY_IF_LE64(int16_t, N)>
1724HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
1725 Vec128<int16_t, N> b) {
1726 return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
1727}
1728
1729// ------------------------------ Floating-point mul / div
1730
1731HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
1732
1733// Approximate reciprocal
1734HWY_API Vec128<float> ApproximateReciprocal(const Vec128<float> v) {
1735 return Vec128<float>(vrecpeq_f32(v.raw));
1736}
1737template <size_t N>
1738HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
1739 return Vec128<float, N>(vrecpe_f32(v.raw));
1740}
1741
1742#if HWY_ARCH_ARM_A64
1743HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
1744#else
1745// Not defined on armv7: approximate
1746namespace detail {
1747
1748HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
1749 const Vec128<float> recip, const Vec128<float> divisor) {
1750 return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
1751}
1752template <size_t N>
1753HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
1754 const Vec128<float, N> recip, Vec128<float, N> divisor) {
1755 return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
1756}
1757
1758} // namespace detail
1759
1760template <size_t N>
1761HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
1762 const Vec128<float, N> b) {
1763 auto x = ApproximateReciprocal(b);
1764 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1765 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1766 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1767 return a * x;
1768}
1769#endif
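// [Added note, not part of the original header] On armv7 the division above
// refines x ~= 1/b with Newton-Raphson: vrecpsq_f32(x, b) computes 2 - x*b,
// so x *= (2 - x*b) converges quadratically toward 1/b; three steps bring the
// vrecpeq_f32 estimate close to full float precision before computing a * x.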
1770
1771// ------------------------------ Absolute value of difference.
1772
1773HWY_API Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
1774 return Vec128<float>(vabdq_f32(a.raw, b.raw));
1775}
1776template <size_t N, HWY_IF_LE64(float, N)>
1777HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
1778 const Vec128<float, N> b) {
1779 return Vec128<float, N>(vabd_f32(a.raw, b.raw));
1780}
1781
1782// ------------------------------ Floating-point multiply-add variants
1783
1784// Returns add + mul * x
1785#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1786template <size_t N, HWY_IF_LE64(float, N)>
1787HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
1788 const Vec128<float, N> x,
1789 const Vec128<float, N> add) {
1790 return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
1791}
1792HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
1793 const Vec128<float> add) {
1794 return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1795}
1796#else
1797// Emulate FMA for floats.
1798template <size_t N>
1799HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
1800 const Vec128<float, N> x,
1801 const Vec128<float, N> add) {
1802 return mul * x + add;
1803}
1804#endif
1805
1806#if HWY_ARCH_ARM_A64
1807HWY_API Vec64<double> MulAdd(const Vec64<double> mul, const Vec64<double> x,
1808 const Vec64<double> add) {
1809 return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
1810}
1811HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
1812 const Vec128<double> add) {
1813 return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1814}
1815#endif
1816
1817// Returns add - mul * x
1818#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1819template <size_t N, HWY_IF_LE64(float, N)>
1820HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
1821 const Vec128<float, N> x,
1822 const Vec128<float, N> add) {
1823 return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
1824}
1825HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
1826 const Vec128<float> add) {
1827 return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1828}
1829#else
1830// Emulate FMA for floats.
1831template <size_t N>
1832HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
1833 const Vec128<float, N> x,
1834 const Vec128<float, N> add) {
1835 return add - mul * x;
1836}
1837#endif
1838
1839#if HWY_ARCH_ARM_A64
1840HWY_API Vec64<double> NegMulAdd(const Vec64<double> mul, const Vec64<double> x,
1841 const Vec64<double> add) {
1842 return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
1843}
1844HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
1845 const Vec128<double> x,
1846 const Vec128<double> add) {
1847 return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
1848}
1849#endif
1850
1851// Returns mul * x - sub
1852template <size_t N>
1853HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
1854 const Vec128<float, N> x,
1855 const Vec128<float, N> sub) {
1856 return MulAdd(mul, x, Neg(sub));
1857}
1858
1859// Returns -mul * x - sub
1860template <size_t N>
1861HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
1862 const Vec128<float, N> x,
1863 const Vec128<float, N> sub) {
1864 return Neg(MulAdd(mul, x, sub));
1865}
1866
1867#if HWY_ARCH_ARM_A64
1868template <size_t N>
1869HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
1870 const Vec128<double, N> x,
1871 const Vec128<double, N> sub) {
1872 return MulAdd(mul, x, Neg(sub));
1873}
1874template <size_t N>
1875HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
1876 const Vec128<double, N> x,
1877 const Vec128<double, N> sub) {
1878 return Neg(MulAdd(mul, x, sub));
1879}
1880#endif
1881
1882// ------------------------------ Floating-point square root (IfThenZeroElse)
1883
1884// Approximate reciprocal square root
1885HWY_API Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
1886 return Vec128<float>(vrsqrteq_f32(v.raw));
1887}
1888template <size_t N>
1889HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
1890 return Vec128<float, N>(vrsqrte_f32(v.raw));
1891}
1892
1893// Full precision square root
1894#if HWY_ARCH_ARM_A64
1895HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
1896#else
1897namespace detail {
1898
1899HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
1900 const Vec128<float> recip) {
1901 return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
1902}
1903template <size_t N>
1903HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(Vec128<float, N> root,
1904 Vec128<float, N> recip) {
1906 return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
1907}
1908
1909} // namespace detail
1910
1911// Not defined on armv7: approximate
1912template <size_t N>
1913HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
1914 auto recip = ApproximateReciprocalSqrt(v);
1915
1916 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1917 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1918 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1919
1920 const auto root = v * recip;
1921 return IfThenZeroElse(v == Zero(Simd<float, N, 0>()), root);
1922}
1923#endif
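// [Added note, not part of the original header] Similarly, on armv7 Sqrt
// refines the vrsqrte estimate r ~= 1/sqrt(v): vrsqrtsq_f32(v*r, r) computes
// (3 - (v*r)*r) / 2, the Newton-Raphson factor for 1/sqrt. The result is
// v * r, and IfThenZeroElse handles v == 0, where r would be infinite and
// v * r would be NaN.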
1924
1925// ================================================== LOGICAL
1926
1927// ------------------------------ Not
1928
1929// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
1930template <typename T>
1931HWY_API Vec128<T> Not(const Vec128<T> v) {
1932 const Full128<T> d;
1933 const Repartition<uint8_t, decltype(d)> d8;
1934 return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
1935}
1936template <typename T, size_t N, HWY_IF_LE64(T, N)>
1937HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
1938 const Simd<T, N, 0> d;
1939 const Repartition<uint8_t, decltype(d)> d8;
1940 using V8 = decltype(Zero(d8));
1941 return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
1942}
1943
1944// ------------------------------ And
1945HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)
1946
1947// Uses the u32/64 defined above.
1948template <typename T, size_t N, HWY_IF_FLOAT(T)>
1949HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
1950 const DFromV<decltype(a)> d;
1951 const RebindToUnsigned<decltype(d)> du;
1952 return BitCast(d, BitCast(du, a) & BitCast(du, b));
1953}
1954
1955// ------------------------------ AndNot
1956
1957namespace detail {
1958// reversed_andnot returns a & ~b.
1959HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
1960} // namespace detail
1961
1962// Returns ~not_mask & mask.
1963template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1964HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
1965 const Vec128<T, N> mask) {
1966 return detail::reversed_andnot(mask, not_mask);
1967}
1968
1969// Uses the u32/64 defined above.
1970template <typename T, size_t N, HWY_IF_FLOAT(T)>
1971HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
1972 const Vec128<T, N> mask) {
1973 const DFromV<decltype(mask)> d;
1974 const RebindToUnsigned<decltype(d)> du;
1975 VFromD<decltype(du)> ret =
1976 detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
1977 return BitCast(d, ret);
1978}
1979
1980// ------------------------------ Or
1981
1982HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)
1983
1984// Uses the u32/64 defined above.
1985template <typename T, size_t N, HWY_IF_FLOAT(T)>
1986HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
1987 const DFromV<decltype(a)> d;
1988 const RebindToUnsigned<decltype(d)> du;
1989 return BitCast(d, BitCast(du, a) | BitCast(du, b));
1990}
1991
1992// ------------------------------ Xor
1993
1994HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)
1995
1996// Uses the u32/64 defined above.
1997template <typename T, size_t N, HWY_IF_FLOAT(T)>
1998HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
1999 const DFromV<decltype(a)> d;
2000 const RebindToUnsigned<decltype(d)> du;
2001 return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
2002}
2003
2004// ------------------------------ Xor3
2005#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3)
2006HWY_NEON_DEF_FUNCTION_FULL_UI(Xor3, veor3, _, 3)
2007
2008// Half vectors are not natively supported. Two Xor are likely more efficient
2009// than Combine to 128-bit.
2010template <typename T, size_t N, HWY_IF_LE64(T, N), HWY_IF_NOT_FLOAT(T)>
2011HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
2012 return Xor(x1, Xor(x2, x3));
2013}
2014
2015template <typename T, size_t N, HWY_IF_FLOAT(T)>
2016HWY_API Vec128<T, N> Xor3(const Vec128<T, N> x1, const Vec128<T, N> x2,
2017 const Vec128<T, N> x3) {
2018 const DFromV<decltype(x1)> d;
2019 const RebindToUnsigned<decltype(d)> du;
2020 return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3)));
2021}
2022
2023#else
2024template <typename T, size_t N>
2025HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
2026 return Xor(x1, Xor(x2, x3));
2027}
2028#endif
2029
2030// ------------------------------ Or3
2031
2032template <typename T, size_t N>
2033HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
2034 return Or(o1, Or(o2, o3));
2035}
2036
2037// ------------------------------ OrAnd
2038
2039template <typename T, size_t N>
2040HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
2041 return Or(o, And(a1, a2));
2042}
2043
2044// ------------------------------ IfVecThenElse
2045
2046template <typename T, size_t N>
2047HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
2048 Vec128<T, N> no) {
2049 return IfThenElse(MaskFromVec(mask), yes, no);
2050}
2051
2052// ------------------------------ Operator overloads (internal-only if float)
2053
2054template <typename T, size_t N>
2055HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
2056 return And(a, b);
2057}
2058
2059template <typename T, size_t N>
2060HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
2061 return Or(a, b);
2062}
2063
2064template <typename T, size_t N>
2065HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
2066 return Xor(a, b);
2067}
2068
2069// ------------------------------ PopulationCount
2070
2071#ifdef HWY_NATIVE_POPCNT
2072#undef HWY_NATIVE_POPCNT
2073#else
2074#define HWY_NATIVE_POPCNT
2075#endif
2076
2077namespace detail {
2078
2079template <typename T>
2080HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) {
2081 const Full128<uint8_t> d8;
2082 return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
2083}
2084template <typename T, size_t N, HWY_IF_LE64(T, N)>
2085HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
2086 Vec128<T, N> v) {
2087 const Simd<uint8_t, N, 0> d8;
2088 return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
2089}
2090
2091// ARM lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
2092template <typename T>
2093HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) {
2094 const Full128<uint8_t> d8;
2095 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2096 return Vec128<T>(vpaddlq_u8(bytes));
2097}
2098template <typename T, size_t N, HWY_IF_LE64(T, N)>
2099HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
2100 Vec128<T, N> v) {
2101 const Repartition<uint8_t, Simd<T, N, 0>> d8;
2102 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2103 return Vec128<T, N>(vpaddl_u8(bytes));
2104}
2105
2106template <typename T>
2107HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) {
2108 const Full128<uint8_t> d8;
2109 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2110 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2111}
2112template <typename T, size_t N, HWY_IF_LE64(T, N)>
2113HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
2114 Vec128<T, N> v) {
2115 const Repartition<uint8_t, Simd<T, N, 0>> d8;
2116 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2117 return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
2118}
2119
2120template <typename T>
2121HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) {
2122 const Full128<uint8_t> d8;
2123 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2124 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2125}
2126template <typename T, size_t N, HWY_IF_LE64(T, N)>
2127HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
2128 Vec128<T, N> v) {
2129 const Repartition<uint8_t, Simd<T, N, 0>> d8;
2130 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2131 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2132}
2133
2134} // namespace detail
2135
2136template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
2137HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
2138 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
2139}
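// Editor's usage sketch: illustrative only, not part of the original header.
// PopulationCount operates per lane; lanes wider than one byte reuse the
// byte-wise vcnt result and reduce it with widening pairwise adds.
static inline Vec128<uint64_t> PopulationCountExample() {
  const Full128<uint64_t> d;
  const auto v = Set(d, 0x00FF00FF00FF00FFull);  // 32 bits set per lane
  return PopulationCount(v);                     // both lanes now hold 32
}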
2140
2141// ================================================== SIGN
2142
2143// ------------------------------ Abs
2144
2145// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
2146HWY_API Vec128<int8_t> Abs(const Vec128<int8_t> v) {
2147 return Vec128<int8_t>(vabsq_s8(v.raw));
2148}
2149HWY_API Vec128<int16_t> Abs(const Vec128<int16_t> v) {
2150 return Vec128<int16_t>(vabsq_s16(v.raw));
2151}
2152HWY_API Vec128<int32_t> Abs(const Vec128<int32_t> v) {
2153 return Vec128<int32_t>(vabsq_s32(v.raw));
2154}
2155// i64 is implemented after BroadcastSignBit.
2156HWY_API Vec128<float> Abs(const Vec128<float> v) {
2157 return Vec128<float>(vabsq_f32(v.raw));
2158}
2159
2160template <size_t N, HWY_IF_LE64(int8_t, N)>
2161HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
2162 return Vec128<int8_t, N>(vabs_s8(v.raw));
2163}
2164template <size_t N, HWY_IF_LE64(int16_t, N)>
2165HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
2166 return Vec128<int16_t, N>(vabs_s16(v.raw));
2167}
2168template <size_t N, HWY_IF_LE64(int32_t, N)>
2169HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
2170 return Vec128<int32_t, N>(vabs_s32(v.raw));
2171}
2172template <size_t N, HWY_IF_LE64(float, N)>
2173HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
2174 return Vec128<float, N>(vabs_f32(v.raw));
2175}
2176
2177#if HWY_ARCH_ARM_A64
2178HWY_API Vec128<double> Abs(const Vec128<double> v) {
2179 return Vec128<double>(vabsq_f64(v.raw));
2180}
2181
2182HWY_API Vec64<double> Abs(const Vec64<double> v) {
2183 return Vec64<double>(vabs_f64(v.raw));
2184}
2185#endif
2186
2187// ------------------------------ CopySign
2188
2189template <typename T, size_t N>
2190HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
2191 const Vec128<T, N> sign) {
2192 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2193 const auto msb = SignBit(Simd<T, N, 0>());
2194 return Or(AndNot(msb, magn), And(msb, sign));
2195}
2196
2197template <typename T, size_t N>
2198HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
2199 const Vec128<T, N> sign) {
2200 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2201 return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
2202}
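// Editor's usage sketch: illustrative only, not part of the original header.
// CopySignToAbs skips the AndNot of CopySign because its first argument is
// already known to be non-negative, e.g. after computing on the magnitude:
static inline Vec128<float> AddHalfToMagnitude(Vec128<float> v) {
  const Full128<float> d;
  const auto bumped = Abs(v) + Set(d, 0.5f);  // non-negative by construction
  return CopySignToAbs(bumped, v);            // reattach the original sign
}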
2203
2204// ------------------------------ BroadcastSignBit
2205
2206template <typename T, size_t N, HWY_IF_SIGNED(T)>
2207HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
2208 return ShiftRight<sizeof(T) * 8 - 1>(v);
2209}
2210
2211// ================================================== MASK
2212
2213// ------------------------------ To/from vector
2214
2215// Mask and Vec have the same representation (true = FF..FF).
2216template <typename T, size_t N>
2217HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
2218 const Simd<MakeUnsigned<T>, N, 0> du;
2219 return Mask128<T, N>(BitCast(du, v).raw);
2220}
2221
2222template <typename T, size_t N>
2223HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> d, const Mask128<T, N> v) {
2224 return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
2225}
2226
2227// ------------------------------ RebindMask
2228
2229template <typename TFrom, typename TTo, size_t N>
2230HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> dto, Mask128<TFrom, N> m) {
2231 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
2232 return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N, 0>(), m)));
2233}
2234
2235// ------------------------------ IfThenElse(mask, yes, no) = mask ? yes : no.
2236
2237#define HWY_NEON_BUILD_TPL_HWY_IF
2238#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2239#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2240 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2241 const Vec128<type##_t, size> no
2242#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
2243
2244HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
2245
2246#undef HWY_NEON_BUILD_TPL_HWY_IF
2247#undef HWY_NEON_BUILD_RET_HWY_IF
2248#undef HWY_NEON_BUILD_PARAM_HWY_IF
2249#undef HWY_NEON_BUILD_ARG_HWY_IF
2250
2251// mask ? yes : 0
2252template <typename T, size_t N>
2253HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
2254 const Vec128<T, N> yes) {
2255 return yes & VecFromMask(Simd<T, N, 0>(), mask);
2256}
2257
2258// mask ? 0 : no
2259template <typename T, size_t N>
2260HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
2261 const Vec128<T, N> no) {
2262 return AndNot(VecFromMask(Simd<T, N, 0>(), mask), no);
2263}
2264
2265template <typename T, size_t N>
2266HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2267 Vec128<T, N> no) {
2268 static_assert(IsSigned<T>(), "Only works for signed/float");
2269 const Simd<T, N, 0> d;
2270 const RebindToSigned<decltype(d)> di;
2271
2272 Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2273 return IfThenElse(m, yes, no);
2274}
2275
2276template <typename T, size_t N>
2277HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2278 const auto zero = Zero(Simd<T, N, 0>());
2279 return Max(zero, v);
2280}
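// Editor's usage sketch: illustrative only, not part of the original header.
// The IfThen* helpers are branch-free selects built on vbsl; e.g. clamping
// negative lanes to zero (equivalent to ZeroIfNegative above):
static inline Vec128<float> ClampNegativeToZero(Vec128<float> v) {
  const Full128<float> d;
  return IfNegativeThenElse(v, Zero(d), v);  // sign bit set ? 0 : v
}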
2281
2282// ------------------------------ Mask logical
2283
2284template <typename T, size_t N>
2285HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
2286 return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
2287}
2288
2289template <typename T, size_t N>
2290HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
2291 const Simd<T, N, 0> d;
2292 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
2293}
2294
2295template <typename T, size_t N>
2296HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
2297 const Simd<T, N, 0> d;
2298 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
2299}
2300
2301template <typename T, size_t N>
2302HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
2303 const Simd<T, N, 0> d;
2304 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
2305}
2306
2307template <typename T, size_t N>
2308HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
2309 const Simd<T, N, 0> d;
2310 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
2311}
2312
2313template <typename T, size_t N>
2314HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
2315 const Simd<T, N, 0> d;
2316 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
2317}
2318
2319// ================================================== COMPARE
2320
2321// Comparisons fill a lane with 1-bits if the condition is true, else 0.
2322
2323// ------------------------------ Shuffle2301 (for i64 compares)
2324
2325// Swap 32-bit halves in 64-bits
2326HWY_API Vec64<uint32_t> Shuffle2301(const Vec64<uint32_t> v) {
2327 return Vec64<uint32_t>(vrev64_u32(v.raw));
2328}
2329HWY_API Vec64<int32_t> Shuffle2301(const Vec64<int32_t> v) {
2330 return Vec64<int32_t>(vrev64_s32(v.raw));
2331}
2332HWY_API Vec64<float> Shuffle2301(const Vec64<float> v) {
2333 return Vec64<float>(vrev64_f32(v.raw));
2334}
2335HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
2336 return Vec128<uint32_t>(vrev64q_u32(v.raw));
2337}
2338HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
2339 return Vec128<int32_t>(vrev64q_s32(v.raw));
2340}
2341HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
2342 return Vec128<float>(vrev64q_f32(v.raw));
2343}
2344
2345#define HWY_NEON_BUILD_TPL_HWY_COMPARE
2346#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
2347#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
2348 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
2349#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
2350
2351// ------------------------------ Equality
2352HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
2353#if HWY_ARCH_ARM_A64
2354HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
2355#else
2356// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
2357HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
2358HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
2359#endif
2360
2361// ------------------------------ Strict inequality (signed, float)
2362#if HWY_ARCH_ARM_A64
2363HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
2364#else
2365HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
2366HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
2367#endif
2368HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
2369
2370// ------------------------------ Weak inequality (float)
2371HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
2372
2373#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
2374#undef HWY_NEON_BUILD_RET_HWY_COMPARE
2375#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
2376#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
2377
2378// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
2379
2380#if HWY_ARCH_ARM_V7
2381
2382template <size_t N>
2383HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
2384 const Vec128<int64_t, N> b) {
2385 const Simd<int32_t, N * 2, 0> d32;
2386 const Simd<int64_t, N, 0> d64;
2387 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2388 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2389 return MaskFromVec(BitCast(d64, cmp64));
2390}
2391
2392template <size_t N>
2393HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
2394 const Vec128<uint64_t, N> b) {
2395 const Simd<uint32_t, N * 2, 0> d32;
2396 const Simd<uint64_t, N, 0> d64;
2397 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2398 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2399 return MaskFromVec(BitCast(d64, cmp64));
2400}
2401
2402HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
2403 const Vec128<int64_t> b) {
2404 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
2405 return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
2406}
2407HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
2408 const Vec64<int64_t> b) {
2409 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
2410 return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
2411}
2412
2413template <size_t N>
2414HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
2415 const Vec128<uint64_t, N> b) {
2416 const DFromV<decltype(a)> du;
2417 const RebindToSigned<decltype(du)> di;
2418 const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
2419 return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
2420}
2421
2422#endif
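// Editor's note: illustrative scalar sketch, not part of the original header.
// The ARMv7 fallbacks above emulate 64-bit compares with 32-bit operations:
// equality requires both 32-bit halves to match (cmp32 & Shuffle2301(cmp32)),
// and unsigned less-than reads the borrow out of a - b from the sign bit of
// (~a & b) | (~(a ^ b) & (a - b)). Scalar equivalent of the latter:
static inline bool ScalarU64Less(uint64_t a, uint64_t b) {
  const uint64_t msb = (~a & b) | (~(a ^ b) & (a - b));
  return (msb >> 63) != 0;  // matches BroadcastSignBit in the vector version
}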
2423
2424// ------------------------------ operator!= (operator==)
2425
2426// Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
2427#pragma push_macro("HWY_NEON_DEF_FUNCTION")
2428#undef HWY_NEON_DEF_FUNCTION
2429// This cannot have _any_ template argument (in x86_128 we can at least have N
2430// as an argument), otherwise it is not more specialized than rewritten
2431// operator== in C++20, leading to compile errors.
2432#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2433 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
2434 Vec128<type##_t, size> b) { \
2435 return Not(a == b); \
2436 }
2437
2438HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)
2439
2440#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2441
2442// ------------------------------ Reversed comparisons
2443
2444template <typename T, size_t N>
2445HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
2446 return operator<(b, a);
2447}
2448template <typename T, size_t N>
2449HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
2450 return operator<=(b, a);
2451}
2452
2453// ------------------------------ FirstN (Iota, Lt)
2454
2455template <typename T, size_t N>
2456HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
2457 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
2458 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
2459}
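// Editor's usage sketch: illustrative only, not part of the original header.
// FirstN is the usual way to build a remainder mask for the final loop
// iteration; combined with IfThenElseZero it zeroes lanes >= count:
static inline Vec128<float> KeepFirstLanes(Vec128<float> v, size_t count) {
  const Full128<float> d;
  return IfThenElseZero(FirstN(d, count), v);
}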
2460
2461// ------------------------------ TestBit (Eq)
2462
2463#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
2464#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
2465#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
2466 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
2467#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
2468
2469#if HWY_ARCH_ARM_A64
2470HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
2471#else
2472// No 64-bit versions on armv7
2473HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
2474HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
2475
2476template <size_t N>
2477HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
2478 Vec128<uint64_t, N> bit) {
2479 return (v & bit) == bit;
2480}
2481template <size_t N>
2482HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
2483 Vec128<int64_t, N> bit) {
2484 return (v & bit) == bit;
2485}
2486
2487#endif
2488#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
2489#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
2490#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
2491#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
2492
2493// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
2494HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
2495#if HWY_ARCH_ARM_A64
2496 return Vec128<int64_t>(vabsq_s64(v.raw));
2497#else
2498 const auto zero = Zero(Full128<int64_t>());
2499 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2500#endif
2501}
2502HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
2503#if HWY_ARCH_ARM_A64
2504 return Vec64<int64_t>(vabs_s64(v.raw));
2505#else
2506 const auto zero = Zero(Full64<int64_t>());
2507 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2508#endif
2509}
2510
2511// ------------------------------ Min (IfThenElse, BroadcastSignBit)
2512
2513// Unsigned
2514HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
2515
2516template <size_t N>
2517HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
2518 const Vec128<uint64_t, N> b) {
2519#if HWY_ARCH_ARM_A64
2520 return IfThenElse(b < a, b, a);
2521#else
2522 const DFromV<decltype(a)> du;
2523 const RebindToSigned<decltype(du)> di;
2524 return BitCast(du, BitCast(di, a) - BitCast(di, detail::SaturatedSub(a, b)));
2525#endif
2526}
2527
2528// Signed
2529HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)
2530
2531template <size_t N>
2532HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
2533 const Vec128<int64_t, N> b) {
2534#if HWY_ARCH_ARM_A64
2535 return IfThenElse(b < a, b, a);
2536#else
2537 const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
2538 return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
2539#endif
2540}
2541
2542// Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
2543#if HWY_ARCH_ARM_A64
2544HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
2545#else
2546HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
2547#endif
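// Editor's note: illustrative scalar sketch, not part of the original header.
// The ARMv7 fallbacks above derive 64-bit Min/Max from saturating subtraction:
// for unsigned x, y, sat_sub(x, y) is x - y when x >= y and 0 otherwise, so
// x - sat_sub(x, y) == min(x, y) and y + sat_sub(x, y) == max(x, y). The
// signed variant instead selects on the sign of the saturating difference.
static inline uint64_t ScalarMinViaSatSub(uint64_t x, uint64_t y) {
  const uint64_t sat = (x >= y) ? (x - y) : 0;  // unsigned saturating subtract
  return x - sat;                               // == min(x, y)
}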
2548
2549// ------------------------------ Max (IfThenElse, BroadcastSignBit)
2550
2551// Unsigned (no u64)
2552HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)
2553
2554template <size_t N>
2555HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
2556 const Vec128<uint64_t, N> b) {
2557#if HWY_ARCH_ARM_A64
2558 return IfThenElse(b < a, a, b);
2559#else
2560 const DFromV<decltype(a)> du;
2561 const RebindToSigned<decltype(du)> di;
2562 return BitCast(du, BitCast(di, b) + BitCast(di, detail::SaturatedSub(a, b)));
2563#endif
2564}
2565
2566// Signed (no i64)
2567HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)
2568
2569template <size_t N>
2570HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
2571 const Vec128<int64_t, N> b) {
2572#if HWY_ARCH_ARM_A64
2573 return IfThenElse(b < a, a, b);
2574#else
2575 const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
2576 return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
2577#endif
2578}
2579
2580// Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
2581#if HWY_ARCH_ARM_A64
2582HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
2583#else
2584HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
2585#endif
2586
2587// ================================================== MEMORY
2588
2589// ------------------------------ Load 128
2590
2592 const uint8_t* HWY_RESTRICT unaligned) {
2593 return Vec128<uint8_t>(vld1q_u8(unaligned));
2594}
2596 const uint16_t* HWY_RESTRICT unaligned) {
2597 return Vec128<uint16_t>(vld1q_u16(unaligned));
2598}
2600 const uint32_t* HWY_RESTRICT unaligned) {
2601 return Vec128<uint32_t>(vld1q_u32(unaligned));
2602}
2604 const uint64_t* HWY_RESTRICT unaligned) {
2605 return Vec128<uint64_t>(vld1q_u64(unaligned));
2606}
2608 const int8_t* HWY_RESTRICT unaligned) {
2609 return Vec128<int8_t>(vld1q_s8(unaligned));
2610}
2612 const int16_t* HWY_RESTRICT unaligned) {
2613 return Vec128<int16_t>(vld1q_s16(unaligned));
2614}
2616 const int32_t* HWY_RESTRICT unaligned) {
2617 return Vec128<int32_t>(vld1q_s32(unaligned));
2618}
2620 const int64_t* HWY_RESTRICT unaligned) {
2621 return Vec128<int64_t>(vld1q_s64(unaligned));
2622}
2624 const float* HWY_RESTRICT unaligned) {
2625 return Vec128<float>(vld1q_f32(unaligned));
2626}
2627#if HWY_ARCH_ARM_A64
2628HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
2629 const double* HWY_RESTRICT unaligned) {
2630 return Vec128<double>(vld1q_f64(unaligned));
2631}
2632#endif
2633
2634// ------------------------------ Load 64
2635
2637 const uint8_t* HWY_RESTRICT p) {
2638 return Vec64<uint8_t>(vld1_u8(p));
2639}
2641 const uint16_t* HWY_RESTRICT p) {
2642 return Vec64<uint16_t>(vld1_u16(p));
2643}
2645 const uint32_t* HWY_RESTRICT p) {
2646 return Vec64<uint32_t>(vld1_u32(p));
2647}
2649 const uint64_t* HWY_RESTRICT p) {
2650 return Vec64<uint64_t>(vld1_u64(p));
2651}
2653 const int8_t* HWY_RESTRICT p) {
2654 return Vec64<int8_t>(vld1_s8(p));
2655}
2657 const int16_t* HWY_RESTRICT p) {
2658 return Vec64<int16_t>(vld1_s16(p));
2659}
2661 const int32_t* HWY_RESTRICT p) {
2662 return Vec64<int32_t>(vld1_s32(p));
2663}
2665 const int64_t* HWY_RESTRICT p) {
2666 return Vec64<int64_t>(vld1_s64(p));
2667}
2669 const float* HWY_RESTRICT p) {
2670 return Vec64<float>(vld1_f32(p));
2671}
2672#if HWY_ARCH_ARM_A64
2673HWY_API Vec64<double> LoadU(Full64<double> /* tag */,
2674 const double* HWY_RESTRICT p) {
2675 return Vec64<double>(vld1_f64(p));
2676}
2677#endif
2678// ------------------------------ Load 32
2679
2680// Actual 32-bit broadcast load - used to implement the other lane types
2681// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
2683 const uint32_t* HWY_RESTRICT p) {
2684 return Vec32<uint32_t>(vld1_dup_u32(p));
2685}
2687 const int32_t* HWY_RESTRICT p) {
2688 return Vec32<int32_t>(vld1_dup_s32(p));
2689}
2691 return Vec32<float>(vld1_dup_f32(p));
2692}
2693
2694template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)> // 1 or 2 bytes
2696 const Repartition<uint32_t, decltype(d)> d32;
2697 uint32_t buf;
2698 CopyBytes<4>(p, &buf);
2699 return BitCast(d, LoadU(d32, &buf));
2700}
2701
2702// ------------------------------ Load 16
2703
2704// Actual 16-bit broadcast load - used to implement the other lane types
2705// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
2707 const uint16_t* HWY_RESTRICT p) {
2708 return Vec128<uint16_t, 1>(vld1_dup_u16(p));
2709}
2711 const int16_t* HWY_RESTRICT p) {
2712 return Vec128<int16_t, 1>(vld1_dup_s16(p));
2713}
2714
2715template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2717 const Repartition<uint16_t, decltype(d)> d16;
2718 uint16_t buf;
2719 CopyBytes<2>(p, &buf);
2720 return BitCast(d, LoadU(d16, &buf));
2721}
2722
2723// ------------------------------ Load 8
2724
2726 const uint8_t* HWY_RESTRICT p) {
2727 return Vec128<uint8_t, 1>(vld1_dup_u8(p));
2728}
2729
2731 const int8_t* HWY_RESTRICT p) {
2732 return Vec128<int8_t, 1>(vld1_dup_s8(p));
2733}
2734
2735// [b]float16_t use the same Raw as uint16_t, so forward to that.
2736template <size_t N>
2738 const float16_t* HWY_RESTRICT p) {
2739 const RebindToUnsigned<decltype(d)> du16;
2740 const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2741 return Vec128<float16_t, N>(LoadU(du16, pu16).raw);
2742}
2743template <size_t N>
2745 const bfloat16_t* HWY_RESTRICT p) {
2746 const RebindToUnsigned<decltype(d)> du16;
2747 const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2748 return Vec128<bfloat16_t, N>(LoadU(du16, pu16).raw);
2749}
2750
2751// On ARM, Load is the same as LoadU.
2752template <typename T, size_t N>
2754 return LoadU(d, p);
2755}
2756
2757template <typename T, size_t N>
2759 const T* HWY_RESTRICT aligned) {
2760 return IfThenElseZero(m, Load(d, aligned));
2761}
2762
2763// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
2764template <typename T, size_t N, HWY_IF_LE128(T, N)>
2766 const T* const HWY_RESTRICT p) {
2767 return LoadU(d, p);
2768}
2769
2770// ------------------------------ Store 128
2771
2773 uint8_t* HWY_RESTRICT unaligned) {
2774 vst1q_u8(unaligned, v.raw);
2775}
2777 uint16_t* HWY_RESTRICT unaligned) {
2778 vst1q_u16(unaligned, v.raw);
2779}
2781 uint32_t* HWY_RESTRICT unaligned) {
2782 vst1q_u32(unaligned, v.raw);
2783}
2785 uint64_t* HWY_RESTRICT unaligned) {
2786 vst1q_u64(unaligned, v.raw);
2787}
2789 int8_t* HWY_RESTRICT unaligned) {
2790 vst1q_s8(unaligned, v.raw);
2791}
2793 int16_t* HWY_RESTRICT unaligned) {
2794 vst1q_s16(unaligned, v.raw);
2795}
2797 int32_t* HWY_RESTRICT unaligned) {
2798 vst1q_s32(unaligned, v.raw);
2799}
2801 int64_t* HWY_RESTRICT unaligned) {
2802 vst1q_s64(unaligned, v.raw);
2803}
2805 float* HWY_RESTRICT unaligned) {
2806 vst1q_f32(unaligned, v.raw);
2807}
2808#if HWY_ARCH_ARM_A64
2809HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2810 double* HWY_RESTRICT unaligned) {
2811 vst1q_f64(unaligned, v.raw);
2812}
2813#endif
2814
2815// ------------------------------ Store 64
2816
2818 uint8_t* HWY_RESTRICT p) {
2819 vst1_u8(p, v.raw);
2820}
2822 uint16_t* HWY_RESTRICT p) {
2823 vst1_u16(p, v.raw);
2824}
2826 uint32_t* HWY_RESTRICT p) {
2827 vst1_u32(p, v.raw);
2828}
2830 uint64_t* HWY_RESTRICT p) {
2831 vst1_u64(p, v.raw);
2832}
2834 int8_t* HWY_RESTRICT p) {
2835 vst1_s8(p, v.raw);
2836}
2838 int16_t* HWY_RESTRICT p) {
2839 vst1_s16(p, v.raw);
2840}
2842 int32_t* HWY_RESTRICT p) {
2843 vst1_s32(p, v.raw);
2844}
2846 int64_t* HWY_RESTRICT p) {
2847 vst1_s64(p, v.raw);
2848}
2850 float* HWY_RESTRICT p) {
2851 vst1_f32(p, v.raw);
2852}
2853#if HWY_ARCH_ARM_A64
2854HWY_API void StoreU(const Vec64<double> v, Full64<double> /* tag */,
2855 double* HWY_RESTRICT p) {
2856 vst1_f64(p, v.raw);
2857}
2858#endif
2859
2860// ------------------------------ Store 32
2861
2863 uint32_t* HWY_RESTRICT p) {
2864 vst1_lane_u32(p, v.raw, 0);
2865}
2867 int32_t* HWY_RESTRICT p) {
2868 vst1_lane_s32(p, v.raw, 0);
2869}
2871 float* HWY_RESTRICT p) {
2872 vst1_lane_f32(p, v.raw, 0);
2873}
2874
2875template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)> // 1 or 2 bytes
2877 const Repartition<uint32_t, decltype(d)> d32;
2878 const uint32_t buf = GetLane(BitCast(d32, v));
2879 CopyBytes<4>(&buf, p);
2880}
2881
2882// ------------------------------ Store 16
2883
2885 uint16_t* HWY_RESTRICT p) {
2886 vst1_lane_u16(p, v.raw, 0);
2887}
2889 int16_t* HWY_RESTRICT p) {
2890 vst1_lane_s16(p, v.raw, 0);
2891}
2892
2893template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2895 const Repartition<uint16_t, decltype(d)> d16;
2896 const uint16_t buf = GetLane(BitCast(d16, v));
2897 CopyBytes<2>(&buf, p);
2898}
2899
2900// ------------------------------ Store 8
2901
2903 uint8_t* HWY_RESTRICT p) {
2904 vst1_lane_u8(p, v.raw, 0);
2905}
2907 int8_t* HWY_RESTRICT p) {
2908 vst1_lane_s8(p, v.raw, 0);
2909}
2910
2911// [b]float16_t use the same Raw as uint16_t, so forward to that.
2912template <size_t N>
2915 const RebindToUnsigned<decltype(d)> du16;
2916 const auto pu16 = reinterpret_cast<uint16_t*>(p);
2917 return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2918}
2919template <size_t N>
2922 const RebindToUnsigned<decltype(d)> du16;
2923 const auto pu16 = reinterpret_cast<uint16_t*>(p);
2924 return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2925}
2926
2927HWY_DIAGNOSTICS(push)
2928#if HWY_COMPILER_GCC_ACTUAL
2929 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
2930#endif
2931
2932// On ARM, Store is the same as StoreU.
2933template <typename T, size_t N>
2934HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
2935 StoreU(v, d, aligned);
2936}
2937
2938HWY_DIAGNOSTICS(pop)
2939
2940template <typename T, size_t N>
2942 T* HWY_RESTRICT p) {
2943 // Treat as unsigned so that we correctly support float16.
2944 const RebindToUnsigned<decltype(d)> du;
2945 const auto blended =
2946 IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
2947 StoreU(BitCast(d, blended), d, p);
2948}
2949
2950// ------------------------------ Non-temporal stores
2951
2952// Same as aligned stores on non-x86.
2953
2954template <typename T, size_t N>
2956 T* HWY_RESTRICT aligned) {
2957 Store(v, d, aligned);
2958}
2959
2960// ================================================== CONVERT
2961
2962// ------------------------------ Promotions (part w/ narrow lanes -> full)
2963
2964// Unsigned: zero-extend to full vector.
2966 const Vec64<uint8_t> v) {
2967 return Vec128<uint16_t>(vmovl_u8(v.raw));
2968}
2970 const Vec32<uint8_t> v) {
2971 uint16x8_t a = vmovl_u8(v.raw);
2972 return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
2973}
2975 const Vec64<uint16_t> v) {
2976 return Vec128<uint32_t>(vmovl_u16(v.raw));
2977}
2979 const Vec64<uint32_t> v) {
2980 return Vec128<uint64_t>(vmovl_u32(v.raw));
2981}
2986 uint16x8_t a = vmovl_u8(v.raw);
2987 return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
2988}
2992
2993// Unsigned: zero-extend to half vector.
2994template <size_t N, HWY_IF_LE64(uint16_t, N)>
2996 const Vec128<uint8_t, N> v) {
2997 return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw)));
2998}
2999template <size_t N, HWY_IF_LE64(uint32_t, N)>
3001 const Vec128<uint8_t, N> v) {
3002 uint16x8_t a = vmovl_u8(v.raw);
3003 return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a))));
3004}
3005template <size_t N>
3007 const Vec128<uint16_t, N> v) {
3008 return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw)));
3009}
3010template <size_t N, HWY_IF_LE64(uint64_t, N)>
3012 const Vec128<uint32_t, N> v) {
3013 return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
3014}
3015template <size_t N, HWY_IF_LE64(int16_t, N)>
3017 const Vec128<uint8_t, N> v) {
3018 return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
3019}
3020template <size_t N, HWY_IF_LE64(int32_t, N)>
3022 const Vec128<uint8_t, N> v) {
3023 uint16x8_t a = vmovl_u8(v.raw);
3024 uint32x4_t b = vmovl_u16(vget_low_u16(a));
3025 return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b)));
3026}
3027template <size_t N, HWY_IF_LE64(int32_t, N)>
3029 const Vec128<uint16_t, N> v) {
3030 uint32x4_t a = vmovl_u16(v.raw);
3031 return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a)));
3032}
3033
3034// Signed: replicate sign bit to full vector.
3036 const Vec64<int8_t> v) {
3037 return Vec128<int16_t>(vmovl_s8(v.raw));
3038}
3040 const Vec32<int8_t> v) {
3041 int16x8_t a = vmovl_s8(v.raw);
3042 return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
3043}
3045 const Vec64<int16_t> v) {
3046 return Vec128<int32_t>(vmovl_s16(v.raw));
3047}
3049 const Vec64<int32_t> v) {
3050 return Vec128<int64_t>(vmovl_s32(v.raw));
3051}
3052
3053// Signed: replicate sign bit to half vector.
3054template <size_t N>
3056 const Vec128<int8_t, N> v) {
3057 return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw)));
3058}
3059template <size_t N>
3061 const Vec128<int8_t, N> v) {
3062 int16x8_t a = vmovl_s8(v.raw);
3063 int32x4_t b = vmovl_s16(vget_low_s16(a));
3064 return Vec128<int32_t, N>(vget_low_s32(b));
3065}
3066template <size_t N>
3068 const Vec128<int16_t, N> v) {
3069 return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw)));
3070}
3071template <size_t N>
3073 const Vec128<int32_t, N> v) {
3074 return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw)));
3075}
3076
3077#if __ARM_FP & 2
3078
3079HWY_API Vec128<float> PromoteTo(Full128<float> /* tag */,
3080 const Vec128<float16_t, 4> v) {
3081 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3082 return Vec128<float>(f32);
3083}
3084template <size_t N>
3085HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
3086 const Vec128<float16_t, N> v) {
3087 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3088 return Vec128<float, N>(vget_low_f32(f32));
3089}
3090
3091#else
3092
3093template <size_t N>
3094HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
3095 const Vec128<float16_t, N> v) {
3096 const RebindToSigned<decltype(df32)> di32;
3097 const RebindToUnsigned<decltype(df32)> du32;
3098 // Expand to u32 so we can shift.
3099 const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
3100 const auto sign = ShiftRight<15>(bits16);
3101 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
3102 const auto mantissa = bits16 & Set(du32, 0x3FF);
3103 const auto subnormal =
3104 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
3105 Set(df32, 1.0f / 16384 / 1024));
3106
3107 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
3108 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
3109 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3110 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
3111 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3112}
3113
3114#endif
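// Editor's note: illustrative scalar sketch, not part of the original header.
// The software path above widens IEEE binary16 by re-biasing the exponent by
// 127 - 15 and shifting the mantissa into binary32 position; subnormal halves
// are instead converted via an exact float multiply by 2^-24.
static inline float ScalarF16BitsToF32(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  uint32_t bits32;
  if (biased_exp == 0) {  // zero or subnormal: value is mantissa * 2^-24
    const float subnormal =
        static_cast<float>(mantissa) * (1.0f / 16384 / 1024);
    CopyBytes<4>(&subnormal, &bits32);
  } else {  // normal (Inf/NaN inputs are not handled, as in the vector code)
    bits32 = ((biased_exp + 127 - 15) << 23) | (mantissa << 13);
  }
  bits32 |= sign << 31;
  float result;
  CopyBytes<4>(&bits32, &result);
  return result;
}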
3115
3116#if HWY_ARCH_ARM_A64
3117
3118HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
3119 const Vec64<float> v) {
3120 return Vec128<double>(vcvt_f64_f32(v.raw));
3121}
3122
3123HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
3124 const Vec32<float> v) {
3125 return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
3126}
3127
3128HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
3129 const Vec64<int32_t> v) {
3130 const int64x2_t i64 = vmovl_s32(v.raw);
3131 return Vec128<double>(vcvtq_f64_s64(i64));
3132}
3133
3134HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
3135 const Vec32<int32_t> v) {
3136 const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
3137 return Vec64<double>(vcvt_f64_s64(i64));
3138}
3139
3140#endif
3141
3142// ------------------------------ Demotions (full -> part w/ narrow lanes)
3143
3144// From full vector to half or quarter
3146 const Vec128<int32_t> v) {
3147 return Vec64<uint16_t>(vqmovun_s32(v.raw));
3148}
3150 const Vec128<int32_t> v) {
3151 return Vec64<int16_t>(vqmovn_s32(v.raw));
3152}
3154 const Vec128<int32_t> v) {
3155 const uint16x4_t a = vqmovun_s32(v.raw);
3156 return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
3157}
3159 const Vec128<int16_t> v) {
3160 return Vec64<uint8_t>(vqmovun_s16(v.raw));
3161}
3163 const Vec128<int32_t> v) {
3164 const int16x4_t a = vqmovn_s32(v.raw);
3165 return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
3166}
3168 const Vec128<int16_t> v) {
3169 return Vec64<int8_t>(vqmovn_s16(v.raw));
3170}
3171
3172// From half vector to partial half
3173template <size_t N, HWY_IF_LE64(int32_t, N)>
3175 const Vec128<int32_t, N> v) {
3176 return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
3177}
3178template <size_t N, HWY_IF_LE64(int32_t, N)>
3180 const Vec128<int32_t, N> v) {
3181 return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
3182}
3183template <size_t N, HWY_IF_LE64(int32_t, N)>
3185 const Vec128<int32_t, N> v) {
3186 const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
3187 return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a)));
3188}
3189template <size_t N, HWY_IF_LE64(int16_t, N)>
3191 const Vec128<int16_t, N> v) {
3192 return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
3193}
3194template <size_t N, HWY_IF_LE64(int32_t, N)>
3196 const Vec128<int32_t, N> v) {
3197 const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
3198 return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a)));
3199}
3200template <size_t N, HWY_IF_LE64(int16_t, N)>
3202 const Vec128<int16_t, N> v) {
3203 return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
3204}
3205
3206#if __ARM_FP & 2
3207
3208HWY_API Vec128<float16_t, 4> DemoteTo(Full64<float16_t> /* tag */,
3209 const Vec128<float> v) {
3210 return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
3211}
3212template <size_t N>
3213HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
3214 const Vec128<float, N> v) {
3215 const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
3216 return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
3217}
3218
3219#else
3220
3221template <size_t N>
3222HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
3223 const Vec128<float, N> v) {
3224 const RebindToUnsigned<decltype(df16)> du16;
3225 const Rebind<uint32_t, decltype(du16)> du;
3226 const RebindToSigned<decltype(du)> di;
3227 const auto bits32 = BitCast(du, v);
3228 const auto sign = ShiftRight<31>(bits32);
3229 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3230 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3231
3232 const auto k15 = Set(di, 15);
3233 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3234 const auto is_tiny = exp < Set(di, -24);
3235
3236 const auto is_subnormal = exp < Set(di, -14);
3237 const auto biased_exp16 =
3238 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3239 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3240 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3241 (mantissa32 >> (Set(du, 13) + sub_exp));
3242 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3243 ShiftRight<13>(mantissa32)); // <1024
3244
3245 const auto sign16 = ShiftLeft<15>(sign);
3246 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3247 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3248 return Vec128<float16_t, N>(DemoteTo(du16, bits16).raw);
3249}
3250
3251#endif
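// Editor's note: illustrative scalar sketch, not part of the original header.
// The software DemoteTo above truncates toward zero and handles three cases:
// values below 2^-24 flush to zero, values below 2^-14 become binary16
// subnormals (the implicit leading 1 is folded into the mantissa), and the
// rest are re-biased normals. Scalar equivalent of one lane:
static inline uint16_t ScalarF32ToF16Bits(float f) {
  uint32_t bits32;
  CopyBytes<4>(&f, &bits32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
  const int32_t exp =
      HWY_MIN(static_cast<int32_t>((bits32 >> 23) & 0xFF) - 127, 15);
  if (exp < -24) return 0;  // too small for binary16: flushed to zero
  uint32_t biased_exp16, mantissa16;
  if (exp < -14) {  // binary16 subnormal
    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);  // in [1, 10]
    mantissa16 = (1u << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
    biased_exp16 = 0;
  } else {  // normal; mantissa is truncated, i.e. rounded toward zero
    mantissa16 = mantissa32 >> 13;
    biased_exp16 = static_cast<uint32_t>(exp + 15);
  }
  return static_cast<uint16_t>((sign << 15) | (biased_exp16 << 10) |
                               mantissa16);
}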
3252
3253template <size_t N>
3254HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
3255 const Vec128<float, N> v) {
3256 const Rebind<int32_t, decltype(dbf16)> di32;
3257 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3258 const Rebind<uint16_t, decltype(dbf16)> du16;
3259 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3260 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3261}
3262
3263#if HWY_ARCH_ARM_A64
3264
3265HWY_API Vec64<float> DemoteTo(Full64<float> /* tag */, const Vec128<double> v) {
3266 return Vec64<float>(vcvt_f32_f64(v.raw));
3267}
3268HWY_API Vec32<float> DemoteTo(Full32<float> /* tag */, const Vec64<double> v) {
3269 return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
3270}
3271
3272HWY_API Vec64<int32_t> DemoteTo(Full64<int32_t> /* tag */,
3273 const Vec128<double> v) {
3274 const int64x2_t i64 = vcvtq_s64_f64(v.raw);
3275 return Vec64<int32_t>(vqmovn_s64(i64));
3276}
3277HWY_API Vec32<int32_t> DemoteTo(Full32<int32_t> /* tag */,
3278 const Vec64<double> v) {
3279 const int64x1_t i64 = vcvt_s64_f64(v.raw);
3280 // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first.
3281 const int64x2_t i64x2 = vcombine_s64(i64, i64);
3282 return Vec32<int32_t>(vqmovn_s64(i64x2));
3283}
3284
3285#endif
3286
3288 const uint8x16_t org_v = detail::BitCastToByte(v).raw;
3289 const uint8x16_t w = vuzp1q_u8(org_v, org_v);
3290 return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
3291}
3292template <size_t N, HWY_IF_LE64(uint32_t, N)>
3294 const uint8x8_t org_v = detail::BitCastToByte(v).raw;
3295 const uint8x8_t w = vuzp1_u8(org_v, org_v);
3296 return Vec128<uint8_t, N>(vuzp1_u8(w, w));
3297}
3298
3299// In the following DemoteTo functions, |b| is purposely left undefined.
3300// The value |a| must be widened to 128 bits so that vqmovn can be used,
3301// and leaving |b| uninitialized avoids any extra initialization overhead.
3302HWY_DIAGNOSTICS(push)
3303HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
3304
3305template <size_t N>
3306HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3307 const Vec128<int32_t> v) {
3310 uint16x8_t c = vcombine_u16(a.raw, b.raw);
3311 return Vec128<uint8_t, N>(vqmovn_u16(c));
3312}
3313
3314template <size_t N>
3316 const Vec128<int32_t> v) {
3319 int16x8_t c = vcombine_s16(a.raw, b.raw);
3320 return Vec128<int8_t, N>(vqmovn_s16(c));
3321}
3322
3323HWY_DIAGNOSTICS(pop)
3324
3325// ------------------------------ Convert integer <=> floating-point
3326
3327HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
3328 const Vec128<int32_t> v) {
3329 return Vec128<float>(vcvtq_f32_s32(v.raw));
3330}
3331template <size_t N, HWY_IF_LE64(int32_t, N)>
3332HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
3333 const Vec128<int32_t, N> v) {
3334 return Vec128<float, N>(vcvt_f32_s32(v.raw));
3335}
3336
3337HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
3338 const Vec128<uint32_t> v) {
3339 return Vec128<float>(vcvtq_f32_u32(v.raw));
3340}
3341template <size_t N, HWY_IF_LE64(uint32_t, N)>
3342HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
3343 const Vec128<uint32_t, N> v) {
3344 return Vec128<float, N>(vcvt_f32_u32(v.raw));
3345}
3346
3347// Truncates (rounds toward zero).
3348HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
3349 const Vec128<float> v) {
3350 return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
3351}
3352template <size_t N, HWY_IF_LE64(float, N)>
3353HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
3354 const Vec128<float, N> v) {
3355 return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
3356}
3357
3358#if HWY_ARCH_ARM_A64
3359
3360HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
3361 const Vec128<int64_t> v) {
3362 return Vec128<double>(vcvtq_f64_s64(v.raw));
3363}
3364HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
3365 const Vec64<int64_t> v) {
3366 return Vec64<double>(vcvt_f64_s64(v.raw));
3367}
3368
3369HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
3370 const Vec128<uint64_t> v) {
3371 return Vec128<double>(vcvtq_f64_u64(v.raw));
3372}
3373HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
3374 const Vec64<uint64_t> v) {
3375 return Vec64<double>(vcvt_f64_u64(v.raw));
3376}
3377
3378// Truncates (rounds toward zero).
3379HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
3380 const Vec128<double> v) {
3381 return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
3382}
3383HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> /* tag */,
3384 const Vec64<double> v) {
3385 return Vec64<int64_t>(vcvt_s64_f64(v.raw));
3386}
3387
3388#endif
3389
3390// ------------------------------ Round (IfThenElse, mask, logical)
3391
3392#if HWY_ARCH_ARM_A64
3393// Toward nearest integer
3394HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
3395
3396// Toward zero, aka truncate
3397HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)
3398
3399// Toward +infinity, aka ceiling
3400HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)
3401
3402// Toward -infinity, aka floor
3403HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
3404#else
3405
3406// ------------------------------ Trunc
3407
3408// ARMv7 only supports truncation to integer. We can either convert back to
3409// float (3 floating-point and 2 logic operations) or manipulate the binary32
3410// representation, clearing the lowest 23-exp mantissa bits. This requires 9
3411// integer operations and 3 constants, which is likely more expensive.
3412
3413namespace detail {
3414
3415// The original value is already the desired result if NaN or the magnitude is
3416// large (i.e. the value is already an integer).
3417template <size_t N>
3418HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
3419 return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
3420}
3421
3422} // namespace detail
3423
3424template <size_t N>
3425HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
3426 const DFromV<decltype(v)> df;
3427 const RebindToSigned<decltype(df)> di;
3428
3429 const auto integer = ConvertTo(di, v); // round toward 0
3430 const auto int_f = ConvertTo(df, integer);
3431
3432 return IfThenElse(detail::UseInt(v), int_f, v);
3433}
3434
3435template <size_t N>
3436HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
3437 const DFromV<decltype(v)> df;
3438
3439 // ARMv7 also lacks a native NearestInt, but we can instead rely on rounding
3440 // (we assume the current mode is nearest-even) after addition with a large
3441 // value such that no mantissa bits remain. We may need a compiler flag for
3442 // precise floating-point to prevent this from being "optimized" out.
3443 const auto max = Set(df, MantissaEnd<float>());
3444 const auto large = CopySignToAbs(max, v);
3445 const auto added = large + v;
3446 const auto rounded = added - large;
3447
3448 // Keep original if NaN or the magnitude is large (already an int).
3449 return IfThenElse(Abs(v) < max, rounded, v);
3450}
3451
3452template <size_t N>
3453HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
3454 const DFromV<decltype(v)> df;
3455 const RebindToSigned<decltype(df)> di;
3456
3457 const auto integer = ConvertTo(di, v); // round toward 0
3458 const auto int_f = ConvertTo(df, integer);
3459
3460 // Truncating a positive non-integer ends up smaller; if so, add 1.
3461 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
3462
3463 return IfThenElse(detail::UseInt(v), int_f - neg1, v);
3464}
3465
3466template <size_t N>
3467HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
3468 const DFromV<decltype(v)> df;
3469 const RebindToSigned<decltype(df)> di;
3470
3471 const auto integer = ConvertTo(di, v); // round toward 0
3472 const auto int_f = ConvertTo(df, integer);
3473
3474 // Truncating a negative non-integer ends up larger; if so, subtract 1.
3475 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
3476
3477 return IfThenElse(detail::UseInt(v), int_f + neg1, v);
3478}
3479
3480#endif
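// Editor's note: illustrative scalar sketch, not part of the original header.
// The ARMv7 Round fallback above adds and then subtracts a value so large
// (MantissaEnd<float>() == 2^23) that no fractional bits survive, letting the
// FPU's round-to-nearest-even mode do the rounding. As noted above, this
// relies on precise float semantics (no reassociation or fast-math).
static inline float ScalarRoundViaLargeAdd(float v) {
  const float kBig = 8388608.0f;                // 2^23
  const float big = (v < 0.0f) ? -kBig : kBig;  // CopySignToAbs(kBig, v)
  if (!(v > -kBig && v < kBig)) return v;       // NaN or already an integer
  volatile float sum = v + big;                 // volatile: keep the add/sub
  return sum - big;
}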
3481
3482// ------------------------------ NearestInt (Round)
3483
3484#if HWY_ARCH_ARM_A64
3485
3486HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
3487 return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
3488}
3489template <size_t N, HWY_IF_LE64(float, N)>
3490HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
3491 return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
3492}
3493
3494#else
3495
3496template <size_t N>
3497HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
3498 const RebindToSigned<DFromV<decltype(v)>> di;
3499 return ConvertTo(di, Round(v));
3500}
3501
3502#endif
3503
3504// ------------------------------ Floating-point classification
3505template <typename T, size_t N>
3506HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
3507 return v != v;
3508}
3509
3510template <typename T, size_t N, HWY_IF_FLOAT(T)>
3511HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
3512 const Simd<T, N, 0> d;
3513 const RebindToSigned<decltype(d)> di;
3514 const VFromD<decltype(di)> vi = BitCast(di, v);
3515 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
3516 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
3517}
3518
3519// Returns whether normal/subnormal/zero.
3520template <typename T, size_t N, HWY_IF_FLOAT(T)>
3521HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
3522 const Simd<T, N, 0> d;
3523 const RebindToUnsigned<decltype(d)> du;
3524 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
3525 const VFromD<decltype(du)> vu = BitCast(du, v);
3526 // 'Shift left' to clear the sign bit, then right so we can compare with the
3527 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
3528 // negative and non-negative floats would be greater).
3529 const VFromD<decltype(di)> exp =
3530 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
3531 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
3532}
3533
3534// ================================================== SWIZZLE
3535
3536// ------------------------------ LowerHalf
3537
3538// <= 64 bit: just return different type
3539template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>
3541 return Vec128<T, N / 2>(v.raw);
3542}
3543
3545 return Vec64<uint8_t>(vget_low_u8(v.raw));
3546}
3548 return Vec64<uint16_t>(vget_low_u16(v.raw));
3549}
3551 return Vec64<uint32_t>(vget_low_u32(v.raw));
3552}
3554 return Vec64<uint64_t>(vget_low_u64(v.raw));
3555}
3557 return Vec64<int8_t>(vget_low_s8(v.raw));
3558}
3560 return Vec64<int16_t>(vget_low_s16(v.raw));
3561}
3563 return Vec64<int32_t>(vget_low_s32(v.raw));
3564}
3566 return Vec64<int64_t>(vget_low_s64(v.raw));
3567}
3569 return Vec64<float>(vget_low_f32(v.raw));
3570}
3571#if HWY_ARCH_ARM_A64
3572HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
3573 return Vec64<double>(vget_low_f64(v.raw));
3574}
3575#endif
3577 const Full128<uint16_t> du;
3578 const Full64<bfloat16_t> dbh;
3579 return BitCast(dbh, LowerHalf(BitCast(du, v)));
3580}
3581
3582template <typename T, size_t N>
3584 Vec128<T, N> v) {
3585 return LowerHalf(v);
3586}
3587
3588// ------------------------------ CombineShiftRightBytes
3589
3590// 128-bit
3591template <int kBytes, typename T, class V128 = Vec128<T>>
3592HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
3593 static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
3594 const Repartition<uint8_t, decltype(d)> d8;
3595 uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3596 return BitCast(d, Vec128<uint8_t>(v8));
3597}
3598
3599// 64-bit
3600template <int kBytes, typename T>
3602 static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
3603 const Repartition<uint8_t, decltype(d)> d8;
3604 uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
3605 return BitCast(d, VFromD<decltype(d8)>(v8));
3606}
3607
3608// <= 32-bit defined after ShiftLeftBytes.
3609
3610// ------------------------------ Shift vector by constant #bytes
3611
3612namespace detail {
3613
3614// Partially specialize because kBytes = 0 and >= size are compile errors;
3615// callers replace the latter with 0xFF for easier specialization.
3616template <int kBytes>
3618 // Full
3619 template <class T>
3621 const Full128<T> d;
3622 return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
3623 }
3624
3625 // Partial
3626 template <class T, size_t N, HWY_IF_LE64(T, N)>
3628 // Expand to 64-bit so we only use the native EXT instruction.
3629 const Full64<T> d64;
3630 const auto zero64 = Zero(d64);
3631 const decltype(zero64) v64(v.raw);
3632 return Vec128<T, N>(
3633 CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3634 }
3635};
3636template <>
3638 template <class T, size_t N>
3640 return v;
3641 }
3642};
3643template <>
3644struct ShiftLeftBytesT<0xFF> {
3645 template <class T, size_t N>
3647 return Zero(Simd<T, N, 0>());
3648 }
3649};
3650
3651template <int kBytes>
3653 template <class T, size_t N>
3655 const Simd<T, N, 0> d;
3656 // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
3657 if (N * sizeof(T) < 8) {
3658 constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
3659 const Simd<T, kReg / sizeof(T), 0> dreg;
3660 v = Vec128<T, N>(
3661 IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
3662 }
3663 return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
3664 }
3665};
3666template <>
3668 template <class T, size_t N>
3670 return v;
3671 }
3672};
3673template <>
3674struct ShiftRightBytesT<0xFF> {
3675 template <class T, size_t N>
3677 return Zero(Simd<T, N, 0>());
3678 }
3679};
3680
3681} // namespace detail
3682
3683template <int kBytes, typename T, size_t N>
3685 return detail::ShiftLeftBytesT < kBytes >= N * sizeof(T) ? 0xFF
3686 : kBytes > ()(v);
3687}
3688
3689template <int kBytes, typename T, size_t N>
3691 return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
3692}
3693
3694template <int kLanes, typename T, size_t N>
3696 const Repartition<uint8_t, decltype(d)> d8;
3697 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3698}
3699
3700template <int kLanes, typename T, size_t N>
3702 return ShiftLeftLanes<kLanes>(Simd<T, N, 0>(), v);
3703}
3704
3705// 0x01..0F, kBytes = 1 => 0x0001..0E
3706template <int kBytes, typename T, size_t N>
3708 return detail::ShiftRightBytesT < kBytes >= N * sizeof(T) ? 0xFF
3709 : kBytes > ()(v);
3710}
3711
3712template <int kLanes, typename T, size_t N>
3714 const Repartition<uint8_t, decltype(d)> d8;
3715 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3716}
3717
3718// Calls ShiftLeftBytes
3719template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
3721 Vec128<T, N> lo) {
3722 constexpr size_t kSize = N * sizeof(T);
3723 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3724 const Repartition<uint8_t, decltype(d)> d8;
3725 const Full64<uint8_t> d_full8;
3726 const Repartition<T, decltype(d_full8)> d_full;
3727 using V64 = VFromD<decltype(d_full8)>;
3728 const V64 hi64(BitCast(d8, hi).raw);
3729 // Move into most-significant bytes
3730 const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
3731 const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
3732 // After casting to full 64-bit vector of correct type, shrink to 32-bit
3733 return Vec128<T, N>(BitCast(d_full, r).raw);
3734}
3735
3736// ------------------------------ UpperHalf (ShiftRightBytes)
3737
3738// Full input
3740 const Vec128<uint8_t> v) {
3741 return Vec64<uint8_t>(vget_high_u8(v.raw));
3742}
3744 const Vec128<uint16_t> v) {
3745 return Vec64<uint16_t>(vget_high_u16(v.raw));
3746}
3748 const Vec128<uint32_t> v) {
3749 return Vec64<uint32_t>(vget_high_u32(v.raw));
3750}
3752 const Vec128<uint64_t> v) {
3753 return Vec64<uint64_t>(vget_high_u64(v.raw));
3754}
3756 const Vec128<int8_t> v) {
3757 return Vec64<int8_t>(vget_high_s8(v.raw));
3758}
3760 const Vec128<int16_t> v) {
3761 return Vec64<int16_t>(vget_high_s16(v.raw));
3762}
3764 const Vec128<int32_t> v) {
3765 return Vec64<int32_t>(vget_high_s32(v.raw));
3766}
3768 const Vec128<int64_t> v) {
3769 return Vec64<int64_t>(vget_high_s64(v.raw));
3770}
3772 return Vec64<float>(vget_high_f32(v.raw));
3773}
3774#if HWY_ARCH_ARM_A64
3775HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
3776 const Vec128<double> v) {
3777 return Vec64<double>(vget_high_f64(v.raw));
3778}
3779#endif
3780
3782 const Vec128<bfloat16_t> v) {
3783 const RebindToUnsigned<decltype(dbh)> duh;
3784 const Twice<decltype(duh)> du;
3785 return BitCast(dbh, UpperHalf(duh, BitCast(du, v)));
3786}
3787
3788// Partial
3789template <typename T, size_t N, HWY_IF_LE64(T, N)>
3790HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3791 Vec128<T, N> v) {
3792 const DFromV<decltype(v)> d;
3793 const RebindToUnsigned<decltype(d)> du;
3794 const auto vu = BitCast(du, v);
3795 const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3796 return Vec128<T, (N + 1) / 2>(upper.raw);
3797}
3798
3799// ------------------------------ Broadcast/splat any lane
3800
3801#if HWY_ARCH_ARM_A64
3802// Unsigned
3803template <int kLane>
3804HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
3805 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3806 return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
3807}
3808template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3809HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3810 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3811 return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3812}
3813template <int kLane>
3814HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
3815 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3816 return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
3817}
3818template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3819HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3820 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3821 return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3822}
3823template <int kLane>
3824HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
3825 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3826 return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
3827}
3828// Vec64<uint64_t> is defined below.
3829
3830// Signed
3831template <int kLane>
3832HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
3833 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3834 return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
3835}
3836template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3837HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3838 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3839 return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3840}
3841template <int kLane>
3842HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
3843 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3844 return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
3845}
3846template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3847HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3848 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3849 return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3850}
3851template <int kLane>
3852HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
3853 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3854 return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
3855}
3856// Vec64<int64_t> is defined below.
3857
3858// Float
3859template <int kLane>
3860HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
3861 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3862 return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
3863}
3864template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3865HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3866 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3867 return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3868}
3869template <int kLane>
3870HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
3871 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3872 return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
3873}
3874template <int kLane>
3875HWY_API Vec64<double> Broadcast(const Vec64<double> v) {
3876 static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3877 return v;
3878}
3879
3880#else
3881// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
3882
3883// Unsigned
3884template <int kLane>
3886 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3887 return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
3888}
3889template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3891 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3892 return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3893}
3894template <int kLane>
3896 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3897 return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
3898}
3899template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3901 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3902 return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3903}
3904template <int kLane>
3906 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3907 return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
3908}
3909// Vec64<uint64_t> is defined below.
3910
3911// Signed
3912template <int kLane>
3914 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3915 return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
3916}
3917template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3919 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3920 return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3921}
3922template <int kLane>
3924 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3925 return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
3926}
3927template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3929 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3930 return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3931}
3932template <int kLane>
3934 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3935 return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
3936}
3937// Vec64<int64_t> is defined below.
3938
3939// Float
3940template <int kLane>
3942 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3943 return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
3944}
3945template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3947 static_assert(0 <= kLane && kLane < N, "Invalid lane");
3948 return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3949}
3950
3951#endif
3952
3953template <int kLane>
3954HWY_API Vec64<uint64_t> Broadcast(const Vec64<uint64_t> v) {
3955 static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3956 return v;
3957}
3958template <int kLane>
3959HWY_API Vec64<int64_t> Broadcast(const Vec64<int64_t> v) {
3960 static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3961 return v;
3962}
3963
3964// ------------------------------ TableLookupLanes
3965
3966// Returned by SetTableIndices for use by TableLookupLanes.
3967template <typename T, size_t N>
3968struct Indices128 {
3969 typename detail::Raw128<T, N>::type raw;
3970};
3971
3972template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3973HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
3974 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3975#if HWY_IS_DEBUG_BUILD
3976 const Rebind<TI, decltype(d)> di;
3977 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3978 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
3979#endif
3980
3981 const Repartition<uint8_t, decltype(d)> d8;
3982 using V8 = VFromD<decltype(d8)>;
3983 const Repartition<uint16_t, decltype(d)> d16;
3984
3985 // Broadcast each lane index to all bytes of T and shift to bytes
3986 static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
3987 if (sizeof(T) == 4) {
3988 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3989 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3990 const V8 lane_indices =
3991 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
3992 const V8 byte_indices =
3993 BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
3994 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3995 0, 1, 2, 3, 0, 1, 2, 3};
3996 const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
3997 return Indices128<T, N>{BitCast(d, sum).raw};
3998 } else {
3999 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
4000 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
4001 const V8 lane_indices =
4002 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
4003 const V8 byte_indices =
4004 BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
4005 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
4006 0, 1, 2, 3, 4, 5, 6, 7};
4007 const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
4008 return Indices128<T, N>{BitCast(d, sum).raw};
4009 }
4010}
4011
4012template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
4013HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
4014 const Rebind<TI, decltype(d)> di;
4015 return IndicesFromVec(d, LoadU(di, idx));
4016}
4017
4018template <typename T, size_t N>
4019HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
4020 const DFromV<decltype(v)> d;
4021 const RebindToSigned<decltype(d)> di;
4022 return BitCast(
4023 d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
4024}
4025
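// Usage sketch (added for illustration, not part of the original file; the
// vector v and the index array are assumed): reversing u32 lanes with the
// SetTableIndices/TableLookupLanes pair defined above.
//   const Full128<uint32_t> d;
//   alignas(16) const int32_t kRev[4] = {3, 2, 1, 0};
//   const auto reversed = TableLookupLanes(v, SetTableIndices(d, kRev));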
4026// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
4027
4028// Single lane: no change
4029template <typename T>
4030HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
4031 return v;
4032}
4033
4034// Two lanes: shuffle
4035template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4036HWY_API Vec64<T> Reverse(Full64<T> /* tag */, const Vec64<T> v) {
4037 return Shuffle2301(v);
4038}
4039
4040template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4041HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4042 return Shuffle01(v);
4043}
4044
4045// Four lanes: shuffle
4046template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4047HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4048 return Shuffle0123(v);
4049}
4050
4051// 16-bit
4052template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4053HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
4054 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
4055 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
4056}
4057
4058// ------------------------------ Reverse2
4059
4060template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4061HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
4062 const RebindToUnsigned<decltype(d)> du;
4063 return BitCast(d, Vec128<uint16_t, N>(vrev32_u16(BitCast(du, v).raw)));
4064}
4065template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4066HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
4067 const RebindToUnsigned<decltype(d)> du;
4068 return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
4069}
4070
4071template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
4072HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
4073 const RebindToUnsigned<decltype(d)> du;
4074 return BitCast(d, Vec128<uint32_t, N>(vrev64_u32(BitCast(du, v).raw)));
4075}
4076template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4077HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
4078 const RebindToUnsigned<decltype(d)> du;
4079 return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
4080}
4081
4082template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4083HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4084 return Shuffle01(v);
4085}
4086
4087// ------------------------------ Reverse4
4088
4089template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4090HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
4091 const RebindToUnsigned<decltype(d)> du;
4092 return BitCast(d, Vec128<uint16_t, N>(vrev64_u16(BitCast(du, v).raw)));
4093}
4094template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4095HWY_API Vec128<T> Reverse4(Full128<T> d, const Vec128<T> v) {
4096 const RebindToUnsigned<decltype(d)> du;
4097 return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
4098}
4099
4100template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4101HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4102 return Shuffle0123(v);
4103}
4104
4105template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4106HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
4107 HWY_ASSERT(0); // don't have 4 u64 lanes
4108}
4109
4110// ------------------------------ Reverse8
4111
4112template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4113HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
4114 return Reverse(d, v);
4115}
4116
4117template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4118HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
4119 HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
4120}
4121
4122// ------------------------------ Other shuffles (TableLookupBytes)
4123
4124// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
4125// Shuffle0321 rotates one lane to the right (the previous least-significant
4126// lane is now most-significant). These could also be implemented via
4127// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
4128
4129// Swap 64-bit halves
4130template <typename T>
4131HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
4132 return CombineShiftRightBytes<8>(Full128<T>(), v, v);
4133}
4134template <typename T>
4135HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
4136 return CombineShiftRightBytes<8>(Full128<T>(), v, v);
4137}
4138
4139// Rotate right 32 bits
4140template <typename T>
4141HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
4142 return CombineShiftRightBytes<4>(Full128<T>(), v, v);
4143}
4144
4145// Rotate left 32 bits
4146template <typename T>
4147HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
4148 return CombineShiftRightBytes<12>(Full128<T>(), v, v);
4149}
4150
4151// Reverse
4152template <typename T>
4153HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
4154 return Shuffle2301(Shuffle1032(v));
4155}
4156
4157// ------------------------------ InterleaveLower
4158
4159// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
4160// the least-significant lane) and "b". To concatenate two half-width integers
4161// into one, use ZipLower/Upper instead (also works with scalar).
4162HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2)
4163HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2)
4164
4165#if HWY_ARCH_ARM_A64
4166// N=1 makes no sense (in that case, there would be no upper/lower).
4167HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
4168 const Vec128<uint64_t> b) {
4169 return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
4170}
4171HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
4172 const Vec128<int64_t> b) {
4173 return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
4174}
4175HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
4176 const Vec128<double> b) {
4177 return Vec128<double>(vzip1q_f64(a.raw, b.raw));
4178}
4179#else
4180// ARMv7 emulation.
4181HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
4182 const Vec128<uint64_t> b) {
4183 return CombineShiftRightBytes<8>(Full128<uint64_t>(), b, Shuffle01(a));
4184}
4185HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
4186 const Vec128<int64_t> b) {
4187 return CombineShiftRightBytes<8>(Full128<int64_t>(), b, Shuffle01(a));
4188}
4189#endif
4190
4191// Floats
4192HWY_API Vec128<float> InterleaveLower(const Vec128<float> a,
4193 const Vec128<float> b) {
4194 return Vec128<float>(vzip1q_f32(a.raw, b.raw));
4195}
4196template <size_t N, HWY_IF_LE64(float, N)>
4197HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
4198 const Vec128<float, N> b) {
4199 return Vec128<float, N>(vzip1_f32(a.raw, b.raw));
4200}
4201
4202// < 64 bit parts
4203template <typename T, size_t N, HWY_IF_LE32(T, N)>
4204HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
4205 return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw);
4206}
4207
4208// Additional overload for the optional Simd<> tag.
4209template <typename T, size_t N, class V = Vec128<T, N>>
4210HWY_API V InterleaveLower(Simd<T, N, 0> /* tag */, V a, V b) {
4211 return InterleaveLower(a, b);
4212}
4213
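// Worked example (added; a and b are assumed u32 vectors): with lanes written
// least-significant first, a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3} give
// InterleaveLower(a, b) = {a0,b0,a1,b1}; the Simd<> overload above forwards here.
//   const auto lo_interleaved = InterleaveLower(Full128<uint32_t>(), a, b);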
4214// ------------------------------ InterleaveUpper (UpperHalf)
4215
4216// All functions inside detail lack the required D parameter.
4217namespace detail {
4218HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
4219HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
4220
4221#if HWY_ARCH_ARM_A64
4222// N=1 makes no sense (in that case, there would be no upper/lower).
4223HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4224 const Vec128<uint64_t> b) {
4225 return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
4226}
4227HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
4228 return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
4229}
4230HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) {
4231 return Vec128<double>(vzip2q_f64(a.raw, b.raw));
4232}
4233#else
4234// ARMv7 emulation.
4235HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4236 const Vec128<uint64_t> b) {
4237 return CombineShiftRightBytes<8>(Full128<uint64_t>(), Shuffle01(b), a);
4238}
4239HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
4240 return CombineShiftRightBytes<8>(Full128<int64_t>(), Shuffle01(b), a);
4241}
4242#endif
4243
4244HWY_API Vec128<float> InterleaveUpper(Vec128<float> a, Vec128<float> b) {
4245 return Vec128<float>(vzip2q_f32(a.raw, b.raw));
4246}
4247HWY_API Vec64<float> InterleaveUpper(const Vec64<float> a,
4248 const Vec64<float> b) {
4249 return Vec64<float>(vzip2_f32(a.raw, b.raw));
4250}
4251
4252} // namespace detail
4253
4254// Full register
4255template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>
4256HWY_API V InterleaveUpper(Simd<T, N, 0> /* tag */, V a, V b) {
4257 return detail::InterleaveUpper(a, b);
4258}
4259
4260// Partial
4261template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
4262HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
4263 const Half<decltype(d)> d2;
4264 return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw));
4265}
4266
4267// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
4268
4269// Same as Interleave*, except that the return lanes are double-width integers;
4270// this is necessary because the single-lane scalar cannot return two values.
4271template <class V, class DW = RepartitionToWide<DFromV<V>>>
4272HWY_API VFromD<DW> ZipLower(V a, V b) {
4273 return BitCast(DW(), InterleaveLower(a, b));
4274}
4275template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4276HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4277 return BitCast(dw, InterleaveLower(D(), a, b));
4278}
4279
4280template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4281HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4282 return BitCast(dw, InterleaveUpper(D(), a, b));
4283}
4284
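// Sketch (added; a and b are assumed u16 vectors): ZipLower widens to u32
// lanes whose lower half comes from a and upper half from b, i.e.
// lane0 = (uint32_t(b0) << 16) | a0.
//   const Full128<uint16_t> d16;
//   const RepartitionToWide<decltype(d16)> d32;
//   const Vec128<uint32_t> zipped = ZipLower(d32, a, b);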
4285// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4286
4287template <size_t N>
4288HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
4289 Vec128<bfloat16_t, 2 * N> a,
4290 Vec128<bfloat16_t, 2 * N> b,
4291 const Vec128<float, N> sum0,
4292 Vec128<float, N>& sum1) {
4293 const Rebind<uint32_t, decltype(df32)> du32;
4294 using VU32 = VFromD<decltype(du32)>;
4295 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
4296 // Avoid ZipLower/Upper so this also works on big-endian systems.
4297 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
4298 const VU32 ao = And(BitCast(du32, a), odd);
4299 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
4300 const VU32 bo = And(BitCast(du32, b), odd);
4301 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
4302 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
4303}
4304
4305HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(Full128<int32_t> /*d32*/,
4306 Vec128<int16_t> a,
4307 Vec128<int16_t> b,
4308 const Vec128<int32_t> sum0,
4309 Vec128<int32_t>& sum1) {
4310#if HWY_ARCH_ARM_A64
4311 sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
4312#else
4313 const Full64<int16_t> dh;
4314 sum1 = Vec128<int32_t>(
4315 vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
4316#endif
4317 return Vec128<int32_t>(
4318 vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
4319}
4320
4321HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(Full64<int32_t> d32,
4322 Vec64<int16_t> a,
4323 Vec64<int16_t> b,
4324 const Vec64<int32_t> sum0,
4325 Vec64<int32_t>& sum1) {
4326 // vmlal writes into the upper half, which the caller cannot use, so
4327 // split into two halves.
4328 const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
4329 const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
4330 sum1 += mul_32;
4331 return sum0 + LowerHalf(mul_3210);
4332}
4333
4334HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(Full32<int32_t> d32,
4335 Vec32<int16_t> a,
4336 Vec32<int16_t> b,
4337 const Vec32<int32_t> sum0,
4338 Vec32<int32_t>& sum1) {
4339 const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
4340 const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
4341 const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
4342 const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
4343 sum1 += mul1;
4344 return sum0 + mul0;
4345}
4346
4347// ================================================== COMBINE
4348
4349// ------------------------------ Combine (InterleaveLower)
4350
4351// Full result
4352HWY_API Vec128<uint8_t> Combine(Full128<uint8_t> /* tag */, Vec64<uint8_t> hi,
4353 Vec64<uint8_t> lo) {
4354 return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
4355}
4356HWY_API Vec128<uint16_t> Combine(Full128<uint16_t> /* tag */,
4357 Vec64<uint16_t> hi, Vec64<uint16_t> lo) {
4358 return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
4359}
4360HWY_API Vec128<uint32_t> Combine(Full128<uint32_t> /* tag */,
4361 Vec64<uint32_t> hi, Vec64<uint32_t> lo) {
4362 return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
4363}
4364HWY_API Vec128<uint64_t> Combine(Full128<uint64_t> /* tag */,
4365 Vec64<uint64_t> hi, Vec64<uint64_t> lo) {
4366 return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
4367}
4368
4369HWY_API Vec128<int8_t> Combine(Full128<int8_t> /* tag */, Vec64<int8_t> hi,
4370 Vec64<int8_t> lo) {
4371 return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
4372}
4373HWY_API Vec128<int16_t> Combine(Full128<int16_t> /* tag */, Vec64<int16_t> hi,
4374 Vec64<int16_t> lo) {
4375 return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
4376}
4377HWY_API Vec128<int32_t> Combine(Full128<int32_t> /* tag */, Vec64<int32_t> hi,
4378 Vec64<int32_t> lo) {
4379 return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
4380}
4381HWY_API Vec128<int64_t> Combine(Full128<int64_t> /* tag */, Vec64<int64_t> hi,
4382 Vec64<int64_t> lo) {
4383 return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
4384}
4385
4386HWY_API Vec128<float> Combine(Full128<float> /* tag */, Vec64<float> hi,
4387 Vec64<float> lo) {
4388 return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
4389}
4390#if HWY_ARCH_ARM_A64
4391HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec64<double> hi,
4392 Vec64<double> lo) {
4393 return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
4394}
4395#endif
4396
4397// < 64bit input, <= 64 bit result
4398template <typename T, size_t N, HWY_IF_LE64(T, N)>
4400 Vec128<T, N / 2> lo) {
4401 // First double N (only lower halves will be used).
4402 const Vec128<T, N> hi2(hi.raw);
4403 const Vec128<T, N> lo2(lo.raw);
4404 // Repartition to two unsigned lanes (each the size of the valid input).
4405 const Simd<UnsignedFromSize<N * sizeof(T) / 2>, 2, 0> du;
4406 return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
4407}
4408
4409// ------------------------------ RearrangeToOddPlusEven (Combine)
4410
4411template <size_t N>
4412HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0,
4413 const Vec128<float, N> sum1) {
4414 return Add(sum0, sum1);
4415}
4416
4417HWY_API Vec128<int32_t> RearrangeToOddPlusEven(const Vec128<int32_t> sum0,
4418 const Vec128<int32_t> sum1) {
4419// vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
4420#if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want
4421 return Vec128<int32_t>(vpaddq_s32(sum0.raw, sum1.raw));
4422#else
4423 const Full128<int32_t> d;
4424 const Half<decltype(d)> d64;
4425 const Vec64<int32_t> hi(
4426 vpadd_s32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
4427 const Vec64<int32_t> lo(
4428 vpadd_s32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
4429 return Combine(Full128<int32_t>(), hi, lo);
4430#endif
4431}
4432
4434 const Vec64<int32_t> sum1) {
4435 // vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
4436 return Vec64<int32_t>(vpadd_s32(sum0.raw, sum1.raw));
4437}
4438
4440 const Vec32<int32_t> sum1) {
4441 // Only one widened sum per register, so add them for sum of odd and even.
4442 return sum0 + sum1;
4443}
4444
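// Usage sketch (added; a and b are assumed bf16 inputs): the widened products
// are split across sum0 and sum1, and RearrangeToOddPlusEven combines them so
// the caller need not care how the split was done.
//   const Full128<float> df32;
//   Vec128<float> sum1 = Zero(df32);
//   const Vec128<float> sum0 =
//       ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
//   const Vec128<float> dot = RearrangeToOddPlusEven(sum0, sum1);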
4445// ------------------------------ ZeroExtendVector (Combine)
4446
4447template <typename T, size_t N>
4448HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
4449 return Combine(d, Zero(Half<decltype(d)>()), lo);
4450}
4451
4452// ------------------------------ ConcatLowerLower
4453
4454// 64 or 128-bit input: just interleave
4455template <typename T, size_t N, HWY_IF_GE64(T, N)>
4456HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
4457 Vec128<T, N> lo) {
4458 // Treat half-width input as a single lane and interleave them.
4459 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4460 return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
4461}
4462
4463namespace detail {
4464#if HWY_ARCH_ARM_A64
4465HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveEven, vtrn1, _, 2)
4466HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveOdd, vtrn2, _, 2)
4467#else
4468
4469// vtrn returns a struct with even and odd result.
4470#define HWY_NEON_BUILD_TPL_HWY_TRN
4471#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
4472// Pass raw args so we can accept uint16x2 args, for which there is no
4473// corresponding uint16x2x2 return type.
4474#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
4475 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
4476#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
4477
4478// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
4479// for full and half vectors.
4480HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
4481HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
4482HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
4483HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
4484HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
4485HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
4486HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
4487HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
4488HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
4489HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
4490HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
4491HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
4492HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
4493HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)
4494#endif
4495} // namespace detail
4496
4497// <= 32-bit input/output
4498template <typename T, size_t N, HWY_IF_LE32(T, N)>
4499HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
4500 Vec128<T, N> lo) {
4501 // Treat half-width input as two lanes and take every second one.
4502 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4503#if HWY_ARCH_ARM_A64
4504 return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
4505#else
4506 using VU = VFromD<decltype(du)>;
4507 return BitCast(
4508 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
4509 .val[0]));
4510#endif
4511}
4512
4513// ------------------------------ ConcatUpperUpper
4514
4515// 64 or 128-bit input: just interleave
4516template <typename T, size_t N, HWY_IF_GE64(T, N)>
4517HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4518 Vec128<T, N> lo) {
4519 // Treat half-width input as a single lane and interleave them.
4520 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4521 return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
4522}
4523
4524// <= 32-bit input/output
4525template <typename T, size_t N, HWY_IF_LE32(T, N)>
4526HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4527 Vec128<T, N> lo) {
4528 // Treat half-width input as two lanes and take every second one.
4529 const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
4530#if HWY_ARCH_ARM_A64
4531 return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
4532#else
4533 using VU = VFromD<decltype(du)>;
4534 return BitCast(
4535 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
4536 .val[1]));
4537#endif
4538}
4539
4540// ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
4541
4542// 64 or 128-bit input: extract from concatenated
4543template <typename T, size_t N, HWY_IF_GE64(T, N)>
4544HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4545 Vec128<T, N> lo) {
4546 return CombineShiftRightBytes<N * sizeof(T) / 2>(d, hi, lo);
4547}
4548
4549// <= 32-bit input/output
4550template <typename T, size_t N, HWY_IF_LE32(T, N)>
4551HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
4552 Vec128<T, N> lo) {
4553 constexpr size_t kSize = N * sizeof(T);
4554 const Repartition<uint8_t, decltype(d)> d8;
4555 const Full64<uint8_t> d8x8;
4556 const Full64<T> d64;
4557 using V8x8 = VFromD<decltype(d8x8)>;
4558 const V8x8 hi8x8(BitCast(d8, hi).raw);
4559 // Move into most-significant bytes
4560 const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
4561 const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
4562 // Back to original lane type, then shrink N.
4563 return Vec128<T, N>(BitCast(d64, r).raw);
4564}
4565
4566// ------------------------------ ConcatUpperLower
4567
4568// Works for all N.
4569template <typename T, size_t N>
4570HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4571 Vec128<T, N> lo) {
4572 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
4573}
4574
4575// ------------------------------ ConcatOdd (InterleaveUpper)
4576
4577namespace detail {
4578// There is no vuzpq_u64.
4579HWY_NEON_DEF_FUNCTION_UIF81632(ConcatEven, vuzp1, _, 2)
4580HWY_NEON_DEF_FUNCTION_UIF81632(ConcatOdd, vuzp2, _, 2)
4581} // namespace detail
4582
4583// Full/half vector
4584template <typename T, size_t N,
4585 hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
4586HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
4587 Vec128<T, N> lo) {
4588 return detail::ConcatOdd(lo, hi);
4589}
4590
4591// 8-bit x4
4592template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4593HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> d, Vec128<T, 4> hi,
4594 Vec128<T, 4> lo) {
4595 const Twice<decltype(d)> d2;
4596 const Repartition<uint16_t, decltype(d2)> dw2;
4597 const VFromD<decltype(d2)> hi2(hi.raw);
4598 const VFromD<decltype(d2)> lo2(lo.raw);
4599 const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
4600 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
4601 // vcopy_lane_u16, but that's A64-only.
4602 return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
4603}
4604
4605// Any type x2
4606template <typename T>
4607HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4608 Vec128<T, 2> lo) {
4609 return InterleaveUpper(d, lo, hi);
4610}
4611
4612// ------------------------------ ConcatEven (InterleaveLower)
4613
4614// Full/half vector
4615template <typename T, size_t N,
4616 hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
4617HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
4618 Vec128<T, N> lo) {
4619 return detail::ConcatEven(lo, hi);
4620}
4621
4622// 8-bit x4
4623template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4624HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> d, Vec128<T, 4> hi,
4625 Vec128<T, 4> lo) {
4626 const Twice<decltype(d)> d2;
4627 const Repartition<uint16_t, decltype(d2)> dw2;
4628 const VFromD<decltype(d2)> hi2(hi.raw);
4629 const VFromD<decltype(d2)> lo2(lo.raw);
4630 const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
4631 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
4632 // vcopy_lane_u16, but that's A64-only.
4633 return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
4634}
4635
4636// Any type x2
4637template <typename T>
4638HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4639 Vec128<T, 2> lo) {
4640 return InterleaveLower(d, lo, hi);
4641}
4642
4643// ------------------------------ DupEven (InterleaveLower)
4644
4645template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4646HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4647#if HWY_ARCH_ARM_A64
4648 return detail::InterleaveEven(v, v);
4649#else
4650 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
4651#endif
4652}
4653
4654template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4655HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4656 return InterleaveLower(Simd<T, N, 0>(), v, v);
4657}
4658
4659// ------------------------------ DupOdd (InterleaveUpper)
4660
4661template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4662HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4663#if HWY_ARCH_ARM_A64
4664 return detail::InterleaveOdd(v, v);
4665#else
4666 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
4667#endif
4668}
4669
4670template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4671HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4672 return InterleaveUpper(Simd<T, N, 0>(), v, v);
4673}
4674
4675// ------------------------------ OddEven (IfThenElse)
4676
4677template <typename T, size_t N>
4678HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4679 const Simd<T, N, 0> d;
4680 const Repartition<uint8_t, decltype(d)> d8;
4681 alignas(16) constexpr uint8_t kBytes[16] = {
4682 ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
4683 ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
4684 ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
4685 ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
4686 ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
4687 ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
4688 ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
4689 ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
4690 };
4691 const auto vec = BitCast(d, Load(d8, kBytes));
4692 return IfThenElse(MaskFromVec(vec), b, a);
4693}
4694
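// Example (added; a and b are assumed vectors of the same type): OddEven keeps
// odd-numbered lanes from a and even-numbered lanes from b, so for four u32
// lanes the result is {b0, a1, b2, a3}.
//   const auto mixed = OddEven(a, b);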
4695// ------------------------------ OddEvenBlocks
4696template <typename T, size_t N>
4697HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4698 return even;
4699}
4700
4701// ------------------------------ SwapAdjacentBlocks
4702
4703template <typename T, size_t N>
4704HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4705 return v;
4706}
4707
4708// ------------------------------ ReverseBlocks
4709
4710// Single block: no change
4711template <typename T>
4712HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
4713 return v;
4714}
4715
4716// ------------------------------ ReorderDemote2To (OddEven)
4717
4718template <size_t N>
4719HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
4720 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4721 const RebindToUnsigned<decltype(dbf16)> du16;
4722 const Repartition<uint32_t, decltype(dbf16)> du32;
4723 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4724 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4725}
4726
4727HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> d16,
4728 Vec128<int32_t> a, Vec128<int32_t> b) {
4729 const Vec64<int16_t> a16(vqmovn_s32(a.raw));
4730#if HWY_ARCH_ARM_A64
4731 (void)d16;
4732 return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
4733#else
4734 const Vec64<int16_t> b16(vqmovn_s32(b.raw));
4735 return Combine(d16, a16, b16);
4736#endif
4737}
4738
4741 const Full128<int32_t> d32;
4742 const Vec128<int32_t> ab = Combine(d32, a, b);
4743 return Vec64<int16_t>(vqmovn_s32(ab.raw));
4744}
4745
4748 const Full128<int32_t> d32;
4749 const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
4750 return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
4751}
4752
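// Sketch (added; a and b are assumed i32 vectors): packs two i32 vectors into
// one saturated i16 vector. As the "Reorder" in the name says, lane order may
// differ from simple concatenation; only the set of values is guaranteed.
//   const Full128<int16_t> d16;
//   const Vec128<int16_t> packed = ReorderDemote2To(d16, a, b);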
4753// ================================================== CRYPTO
4754
4755#if defined(__ARM_FEATURE_AES) || \
4756 (HWY_HAVE_RUNTIME_DISPATCH && HWY_ARCH_ARM_A64)
4757
4758// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4759#ifdef HWY_NATIVE_AES
4760#undef HWY_NATIVE_AES
4761#else
4762#define HWY_NATIVE_AES
4763#endif
4764
4765HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4766 Vec128<uint8_t> round_key) {
4767 // NOTE: it is important that AESE and AESMC be consecutive instructions so
4768 // they can be fused. AESE includes AddRoundKey, which is a different ordering
4769 // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
4770 // round key (the compiler will hopefully optimize this for multiple rounds).
4771 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4772 round_key;
4773}
4774
4775HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
4776 Vec128<uint8_t> round_key) {
4777 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4778}
4779
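// Usage sketch (added; state, kRounds and the expanded round_key[] schedule
// are assumptions, following the AES-NI-style ordering described above):
// whole-block encryption chains AESRound and finishes with AESLastRound.
//   Vec128<uint8_t> block = Xor(state, round_key[0]);
//   for (size_t r = 1; r + 1 < kRounds; ++r) {
//     block = AESRound(block, round_key[r]);
//   }
//   block = AESLastRound(block, round_key[kRounds - 1]);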
4780HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4781 return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
4782}
4783
4784HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4785 return Vec128<uint64_t>(
4786 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4787}
4788
4789#endif // __ARM_FEATURE_AES
4790
4791// ================================================== MISC
4792
4793template <size_t N>
4794HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
4795 const Vec128<bfloat16_t, N> v) {
4796 const Rebind<uint16_t, decltype(df32)> du16;
4797 const RebindToSigned<decltype(df32)> di32;
4798 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4799}
4800
4801// ------------------------------ Truncations
4802
4803template <typename From, typename To, HWY_IF_UNSIGNED(From),
4804 HWY_IF_UNSIGNED(To),
4805 hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
4806HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
4807 const Vec128<From, 1> v) {
4808 const Repartition<To, DFromV<decltype(v)>> d;
4809 const auto v1 = BitCast(d, v);
4810 return Vec128<To, 1>{v1.raw};
4811}
4812
4814 const Vec128<uint64_t, 2> v) {
4815 const Repartition<uint8_t, DFromV<decltype(v)>> d;
4816 const auto v1 = BitCast(d, v);
4817 const auto v2 = detail::ConcatEven(v1, v1);
4818 const auto v3 = detail::ConcatEven(v2, v2);
4819 const auto v4 = detail::ConcatEven(v3, v3);
4820 return LowerHalf(LowerHalf(LowerHalf(v4)));
4821}
4822
4824 const Vec128<uint64_t, 2> v) {
4825 const Repartition<uint16_t, DFromV<decltype(v)>> d;
4826 const auto v1 = BitCast(d, v);
4827 const auto v2 = detail::ConcatEven(v1, v1);
4828 const auto v3 = detail::ConcatEven(v2, v2);
4829 return LowerHalf(LowerHalf(v3));
4830}
4831
4833 const Vec128<uint64_t, 2> v) {
4834 const Repartition<uint32_t, DFromV<decltype(v)>> d;
4835 const auto v1 = BitCast(d, v);
4836 const auto v2 = detail::ConcatEven(v1, v1);
4837 return LowerHalf(v2);
4838}
4839
4840template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
4841HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
4842 const Vec128<uint32_t, N> v) {
4843 const Repartition<uint8_t, DFromV<decltype(v)>> d;
4844 const auto v1 = BitCast(d, v);
4845 const auto v2 = detail::ConcatEven(v1, v1);
4846 const auto v3 = detail::ConcatEven(v2, v2);
4847 return LowerHalf(LowerHalf(v3));
4848}
4849
4850template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
4852 const Vec128<uint32_t, N> v) {
4853 const Repartition<uint16_t, DFromV<decltype(v)>> d;
4854 const auto v1 = BitCast(d, v);
4855 const auto v2 = detail::ConcatEven(v1, v1);
4856 return LowerHalf(v2);
4857}
4858
4859template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
4861 const Vec128<uint16_t, N> v) {
4862 const Repartition<uint8_t, DFromV<decltype(v)>> d;
4863 const auto v1 = BitCast(d, v);
4864 const auto v2 = detail::ConcatEven(v1, v1);
4865 return LowerHalf(v2);
4866}
4867
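// Example (added; v is an assumed u32 vector): TruncateTo keeps the low bits
// of each lane, here narrowing four u32 lanes to four u8 lanes.
//   const Full32<uint8_t> d8;  // 4 lanes x 1 byte
//   const Vec32<uint8_t> narrowed = TruncateTo(d8, v);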
4868// ------------------------------ MulEven (ConcatEven)
4869
4870// Multiplies even lanes (0, 2 ..) and places the double-wide result into
4871// even and the upper half into its odd neighbor lane.
4872HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
4873 const Full128<int32_t> d;
4874 int32x4_t a_packed = ConcatEven(d, a, a).raw;
4875 int32x4_t b_packed = ConcatEven(d, b, b).raw;
4876 return Vec128<int64_t>(
4877 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4878}
4879HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
4880 const Full128<uint32_t> d;
4881 uint32x4_t a_packed = ConcatEven(d, a, a).raw;
4882 uint32x4_t b_packed = ConcatEven(d, b, b).raw;
4883 return Vec128<uint64_t>(
4884 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4885}
4886
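// Example (added; a and b are assumed u32 vectors): for lanes {a0,a1,a2,a3}
// and {b0,b1,b2,b3}, MulEven returns the two full-width products
// {a0*b0, a2*b2} as u64 lanes.
//   const Vec128<uint64_t> products = MulEven(a, b);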
4887template <size_t N>
4888HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
4889 const Vec128<int32_t, N> b) {
4890 const DFromV<decltype(a)> d;
4891 int32x2_t a_packed = ConcatEven(d, a, a).raw;
4892 int32x2_t b_packed = ConcatEven(d, b, b).raw;
4893 return Vec128<int64_t, (N + 1) / 2>(
4894 vget_low_s64(vmull_s32(a_packed, b_packed)));
4895}
4896template <size_t N>
4897HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
4898 const Vec128<uint32_t, N> b) {
4899 const DFromV<decltype(a)> d;
4900 uint32x2_t a_packed = ConcatEven(d, a, a).raw;
4901 uint32x2_t b_packed = ConcatEven(d, b, b).raw;
4902 return Vec128<uint64_t, (N + 1) / 2>(
4903 vget_low_u64(vmull_u32(a_packed, b_packed)));
4904}
4905
4906HWY_API Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4907 uint64_t hi;
4908 uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
4909 return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4910}
4911
4912HWY_API Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
4913 uint64_t hi;
4914 uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
4915 return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
4916}
4917
4918// ------------------------------ TableLookupBytes (Combine, LowerHalf)
4919
4920// Both full
4921template <typename T, typename TI>
4922HWY_API Vec128<TI> TableLookupBytes(const Vec128<T> bytes,
4923 const Vec128<TI> from) {
4924 const Full128<TI> d;
4925 const Repartition<uint8_t, decltype(d)> d8;
4926#if HWY_ARCH_ARM_A64
4927 return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
4928 BitCast(d8, from).raw)));
4929#else
4930 uint8x16_t table0 = BitCast(d8, bytes).raw;
4931 uint8x8x2_t table;
4932 table.val[0] = vget_low_u8(table0);
4933 table.val[1] = vget_high_u8(table0);
4934 uint8x16_t idx = BitCast(d8, from).raw;
4935 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4936 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4937 return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
4938#endif
4939}
4940
4941// Partial index vector
4942template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
4944 const Vec128<TI, NI> from) {
4945 const Full128<TI> d_full;
4946 const Vec64<TI> from64(from.raw);
4947 const auto idx_full = Combine(d_full, from64, from64);
4948 const auto out_full = TableLookupBytes(bytes, idx_full);
4949 return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
4950}
4951
4952// Partial table vector
4953template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>
4955 const Vec128<TI> from) {
4956 const Full128<T> d_full;
4957 return TableLookupBytes(Combine(d_full, bytes, bytes), from);
4958}
4959
4960// Partial both
4961template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N),
4962 HWY_IF_LE64(TI, NI)>
4964 Vec128<T, N> bytes, Vec128<TI, NI> from) {
4965 const Simd<T, N, 0> d;
4966 const Simd<TI, NI, 0> d_idx;
4967 const Repartition<uint8_t, decltype(d_idx)> d_idx8;
4968 // uint8x8
4969 const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
4970 const auto from8 = BitCast(d_idx8, from);
4971 const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4972 return BitCast(d_idx, v8);
4973}
4974
4975// For all vector widths; ARM anyway zeroes if >= 0x10.
4976template <class V, class VI>
4977HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
4978 return TableLookupBytes(bytes, from);
4979}
4980
4981// ------------------------------ Scatter (Store)
4982
4983template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
4984HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
4985 T* HWY_RESTRICT base,
4986 const Vec128<Offset, N> offset) {
4987 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4988
4989 alignas(16) T lanes[N];
4990 Store(v, d, lanes);
4991
4992 alignas(16) Offset offset_lanes[N];
4993 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
4994
4995 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
4996 for (size_t i = 0; i < N; ++i) {
4997 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
4998 }
4999}
5000
5001template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
5002HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
5003 const Vec128<Index, N> index) {
5004 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
5005
5006 alignas(16) T lanes[N];
5007 Store(v, d, lanes);
5008
5009 alignas(16) Index index_lanes[N];
5010 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
5011
5012 for (size_t i = 0; i < N; ++i) {
5013 base[index_lanes[i]] = lanes[i];
5014 }
5015}
5016
5017// ------------------------------ Gather (Load/Store)
5018
5019template <typename T, size_t N, typename Offset>
5020HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
5021 const T* HWY_RESTRICT base,
5022 const Vec128<Offset, N> offset) {
5023 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
5024
5025 alignas(16) Offset offset_lanes[N];
5026 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
5027
5028 alignas(16) T lanes[N];
5029 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
5030 for (size_t i = 0; i < N; ++i) {
5031 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
5032 }
5033 return Load(d, lanes);
5034}
5035
5036template <typename T, size_t N, typename Index>
5037HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
5038 const T* HWY_RESTRICT base,
5039 const Vec128<Index, N> index) {
5040 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
5041
5042 alignas(16) Index index_lanes[N];
5043 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
5044
5045 alignas(16) T lanes[N];
5046 for (size_t i = 0; i < N; ++i) {
5047 lanes[i] = base[index_lanes[i]];
5048 }
5049 return Load(d, lanes);
5050}
5051
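// Usage sketch (added; base, out and kIdx are assumed, and the index type must
// have the same size as the lane type): gather f32 lanes via i32 indices, then
// scatter them back out.
//   const Full128<float> d;
//   const Full128<int32_t> di;
//   alignas(16) const int32_t kIdx[4] = {7, 3, 1, 0};
//   const auto gathered = GatherIndex(d, base, Load(di, kIdx));  // base[kIdx[i]]
//   ScatterIndex(gathered, d, out, Load(di, kIdx));              // out[kIdx[i]]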
5052// ------------------------------ Reductions
5053
5054namespace detail {
5055
5056// N=1 for any T: no-op
5057template <typename T>
5059 const Vec128<T, 1> v) {
5060 return v;
5061}
5062template <typename T>
5064 const Vec128<T, 1> v) {
5065 return v;
5066}
5067template <typename T>
5069 const Vec128<T, 1> v) {
5070 return v;
5071}
5072
5073// full vectors
5074#if HWY_ARCH_ARM_A64
5075#define HWY_NEON_BUILD_RET_REDUCTION(type, size) Vec128<type##_t, size>
5076#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix, dup) \
5077 HWY_API HWY_NEON_BUILD_RET_REDUCTION(type, size) \
5078 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5079 return HWY_NEON_BUILD_RET_REDUCTION( \
5080 type, size)(dup##suffix(HWY_NEON_EVAL(prefix##infix##suffix, v.raw))); \
5081 }
5082
5083#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
5084 HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8, vdup_n_) \
5085 HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8, vdupq_n_) \
5086 HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16, vdup_n_) \
5087 HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16, vdupq_n_) \
5088 HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32, vdup_n_) \
5089 HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32, vdupq_n_) \
5090 HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8, vdup_n_) \
5091 HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8, vdupq_n_) \
5092 HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16, vdup_n_) \
5093 HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16, vdupq_n_) \
5094 HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32, vdup_n_) \
5095 HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32, vdupq_n_) \
5096 HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32, vdup_n_) \
5097 HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32, vdupq_n_) \
5098 HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64, vdupq_n_)
5099
5100HWY_NEON_DEF_REDUCTION_CORE_TYPES(MinOfLanes, vminv)
5101HWY_NEON_DEF_REDUCTION_CORE_TYPES(MaxOfLanes, vmaxv)
5102
5103// u64/s64 don't have horizontal min/max for some reason, but do have add.
5104#define HWY_NEON_DEF_REDUCTION_ALL_TYPES(name, prefix) \
5105 HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
5106 HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64, vdupq_n_) \
5107 HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64, vdupq_n_)
5108
5109HWY_NEON_DEF_REDUCTION_ALL_TYPES(SumOfLanes, vaddv)
5110
5111#undef HWY_NEON_DEF_REDUCTION_ALL_TYPES
5112#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
5113#undef HWY_NEON_DEF_REDUCTION
5114#undef HWY_NEON_BUILD_RET_REDUCTION
5115
5116// Need some fallback implementations for [ui]64x2 and [ui]16x2.
5117#define HWY_IF_SUM_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 1 << 2)
5118#define HWY_IF_MINMAX_REDUCTION(T) \
5119 HWY_IF_LANE_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))
5120
5121#else
5122// u32/i32/f32: N=2
5123template <typename T, HWY_IF_LANE_SIZE(T, 4)>
5125 const Vec128<T, 2> v10) {
5126 return v10 + Shuffle2301(v10);
5127}
5128template <typename T>
5130 const Vec128<T, 2> v10) {
5131 return Min(v10, Shuffle2301(v10));
5132}
5133template <typename T>
5135 const Vec128<T, 2> v10) {
5136 return Max(v10, Shuffle2301(v10));
5137}
5138
5139// ARMv7 version for everything except doubles.
5141 const Vec128<uint32_t> v) {
5142 uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
5143 uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
5144 uint32x4x2_t v1 = vuzpq_u32(c0, c0);
5145 return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
5146}
5148 const Vec128<int32_t> v) {
5149 int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
5150 int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
5151 int32x4x2_t v1 = vuzpq_s32(c0, c0);
5152 return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
5153}
5155 const Vec128<float> v) {
5156 float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
5157 float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
5158 float32x4x2_t v1 = vuzpq_f32(c0, c0);
5159 return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
5160}
5162 const Vec128<uint64_t> v) {
5163 return v + Shuffle01(v);
5164}
5166 const Vec128<int64_t> v) {
5167 return v + Shuffle01(v);
5168}
5169
5170template <typename T>
5172 const Vec128<T> v3210) {
5173 const Vec128<T> v1032 = Shuffle1032(v3210);
5174 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
5175 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5176 return Min(v20_31_20_31, v31_20_31_20);
5177}
5178template <typename T>
5180 const Vec128<T> v3210) {
5181 const Vec128<T> v1032 = Shuffle1032(v3210);
5182 const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
5183 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5184 return Max(v20_31_20_31, v31_20_31_20);
5185}
5186
5187#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
5188#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
5189#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
5190 HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) \
5191 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5192 HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
5193 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5194 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5195 return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
5196 type, size)(HWY_NEON_EVAL(vdup##_lane_##suffix, tmp, 0)); \
5197 }
5198#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
5199 suffix) \
5200 HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) \
5201 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5202 HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
5203 tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
5204 vget_low_##suffix(v.raw)); \
5205 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5206 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5207 if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5208 tmp = vdup_lane_##suffix(tmp, 0); \
5209 return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
5210 type, size)(HWY_NEON_EVAL(vcombine_##suffix, tmp, tmp)); \
5211 }
5212
5213#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
5214 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \
5215 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \
5216 HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \
5217 HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \
5218 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
5219 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \
5220 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \
5221 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8)
5222
5223HWY_NEON_DEF_PAIRWISE_REDUCTIONS(SumOfLanes, vpadd)
5224HWY_NEON_DEF_PAIRWISE_REDUCTIONS(MinOfLanes, vpmin)
5225HWY_NEON_DEF_PAIRWISE_REDUCTIONS(MaxOfLanes, vpmax)
5226
5227#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
5228#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
5229#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
5230#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
5231#undef HWY_NEON_BUILD_TYPE_T
5232
5233template <size_t N, HWY_IF_GE32(uint16_t, N)>
5236 const Simd<uint16_t, N, 0> d;
5237 const RepartitionToWide<decltype(d)> d32;
5238 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5239 const auto odd = ShiftRight<16>(BitCast(d32, v));
5240 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
5241 // Also broadcast into odd lanes.
5242 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
5243}
5244template <size_t N, HWY_IF_GE32(int16_t, N)>
5247 const Simd<int16_t, N, 0> d;
5248 const RepartitionToWide<decltype(d)> d32;
5249 // Sign-extend
5250 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5251 const auto odd = ShiftRight<16>(BitCast(d32, v));
5252 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
5253 // Also broadcast into odd lanes.
5254 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
5255}
5256
5257template <size_t N, HWY_IF_GE32(uint16_t, N)>
5260 const Simd<uint16_t, N, 0> d;
5261 const RepartitionToWide<decltype(d)> d32;
5262 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5263 const auto odd = ShiftRight<16>(BitCast(d32, v));
5264 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
5265 // Also broadcast into odd lanes.
5266 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
5267}
5268template <size_t N, HWY_IF_GE32(int16_t, N)>
5271 const Simd<int16_t, N, 0> d;
5272 const RepartitionToWide<decltype(d)> d32;
5273 // Sign-extend
5274 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5275 const auto odd = ShiftRight<16>(BitCast(d32, v));
5276 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
5277 // Also broadcast into odd lanes.
5278 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
5279}
5280
5281template <size_t N, HWY_IF_GE32(uint16_t, N)>
5284 const Simd<uint16_t, N, 0> d;
5285 const RepartitionToWide<decltype(d)> d32;
5286 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5287 const auto odd = ShiftRight<16>(BitCast(d32, v));
5288 const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
5289 // Also broadcast into odd lanes.
5290 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
5291}
5292template <size_t N, HWY_IF_GE32(int16_t, N)>
5295 const Simd<int16_t, N, 0> d;
5296 const RepartitionToWide<decltype(d)> d32;
5297 // Sign-extend
5298 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5299 const auto odd = ShiftRight<16>(BitCast(d32, v));
5300 const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
5301 // Also broadcast into odd lanes.
5302 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
5303}
5304
5305// Need fallback min/max implementations for [ui]64x2.
5306#define HWY_IF_SUM_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 0)
5307#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 1 << 8)
5308
5309#endif
5310
5311// [ui]16/[ui]64: N=2 -- special case for pairs of very small or large lanes
5312template <typename T, HWY_IF_SUM_REDUCTION(T)>
5314 const Vec128<T, 2> v10) {
5315 return v10 + Reverse2(Simd<T, 2, 0>(), v10);
5316}
5317template <typename T, HWY_IF_MINMAX_REDUCTION(T)>
5319 const Vec128<T, 2> v10) {
5320 return Min(v10, Reverse2(Simd<T, 2, 0>(), v10));
5321}
5322template <typename T, HWY_IF_MINMAX_REDUCTION(T)>
5324 const Vec128<T, 2> v10) {
5325 return Max(v10, Reverse2(Simd<T, 2, 0>(), v10));
5326}
5327
5328#undef HWY_IF_SUM_REDUCTION
5329#undef HWY_IF_MINMAX_REDUCTION
5330
5331} // namespace detail
5332
5333template <typename T, size_t N>
5334HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
5335 return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5336}
5337template <typename T, size_t N>
5338HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
5339 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5340}
5341template <typename T, size_t N>
5342HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
5343 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5344}
5345
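// Example (added; v is an assumed f32 vector): the reductions broadcast their
// result to all lanes, so GetLane extracts the scalar sum.
//   const Full128<float> d;
//   const float total = GetLane(SumOfLanes(d, v));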
5346// ------------------------------ LoadMaskBits (TestBit)
5347
5348namespace detail {
5349
5350// Helper function to set 64 bits and potentially return a smaller vector. The
5351// overload is required to call the q vs non-q intrinsics. Note that 8-bit
5352// LoadMaskBits only requires 16 bits, but 64 avoids casting.
5353template <typename T, size_t N, HWY_IF_LE64(T, N)>
5354HWY_INLINE Vec128<T, N> Set64(Simd<T, N, 0> /* tag */, uint64_t mask_bits) {
5355 const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
5356 return Vec128<T, N>(BitCast(Full64<T>(), v64).raw);
5357}
5358template <typename T>
5359HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
5360 return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
5361}
5362
5363template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
5364HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5365 const RebindToUnsigned<decltype(d)> du;
5366 // Easier than Set(), which would require an >8-bit type, which would not
5367 // compile for T=uint8_t, N=1.
5368 const auto vmask_bits = Set64(du, mask_bits);
5369
5370 // Replicate bytes 8x such that each byte contains the bit that governs it.
5371 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
5372 1, 1, 1, 1, 1, 1, 1, 1};
5373 const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));
5374
5375 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
5376 1, 2, 4, 8, 16, 32, 64, 128};
5377 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
5378}
5379
5380template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5381HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5382 const RebindToUnsigned<decltype(d)> du;
5383 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
5384 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
5385 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
5386}
5387
5388template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5389HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5390 const RebindToUnsigned<decltype(d)> du;
5391 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
5392 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
5393 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
5394}
5395
5396template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5397HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5398 const RebindToUnsigned<decltype(d)> du;
5399 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
5400 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
5401}
5402
5403} // namespace detail
5404
5405// `p` points to at least 8 readable bytes, not all of which need be valid.
5406template <typename T, size_t N, HWY_IF_LE128(T, N)>
5407HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
5408 const uint8_t* HWY_RESTRICT bits) {
5409 uint64_t mask_bits = 0;
5410 CopyBytes<(N + 7) / 8>(bits, &mask_bits);
5411 return detail::LoadMaskBits(d, mask_bits);
5412}
5413
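// Example (added): bits are consumed LSB-first, so 0x05 selects lanes 0 and 2
// of a four-lane vector (array sized to the 8 readable bytes required above).
//   const Full128<uint32_t> d;
//   alignas(8) const uint8_t kBits[8] = {0x05};
//   const Mask128<uint32_t> m = LoadMaskBits(d, kBits);  // lanes 0 and 2 true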
5414// ------------------------------ Mask
5415
5416namespace detail {
5417
5418// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
5419// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
5420template <typename T>
5422 const Full128<uint16_t> du16;
5423 const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
5424 const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
5425 return GetLane(BitCast(Full64<uint64_t>(), nib));
5426}
5427
5428template <typename T>
5430 // There is no vshrn_n_u16 for uint16x4, so zero-extend.
5431 const Twice<decltype(d)> d2;
5432 const Vec128<T> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
5433 // No need to mask, upper half is zero thanks to ZeroExtendVector.
5434 return NibblesFromMask(d2, MaskFromVec(v128));
5435}
5436
5437template <typename T, size_t N, HWY_IF_LE32(T, N)>
5439 const Mask64<T> mask64(mask.raw);
5440 const uint64_t nib = NibblesFromMask(Full64<T>(), mask64);
5441 // Clear nibbles from upper half of 64-bits
5442 constexpr size_t kBytes = sizeof(T) * N;
5443 return nib & ((1ull << (kBytes * 4)) - 1);
5444}
5445
5446template <typename T>
5448 const Mask128<T> mask) {
5449 alignas(16) constexpr uint8_t kSliceLanes[16] = {
5450 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
5451 };
5452 const Full128<uint8_t> du;
5453 const Vec128<uint8_t> values =
5454 BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
5455
5456#if HWY_ARCH_ARM_A64
5457 // Can't vaddv - we need two separate bytes (16 bits).
5458 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
5459 const uint8x8_t x4 = vpadd_u8(x2, x2);
5460 const uint8x8_t x8 = vpadd_u8(x4, x4);
5461 return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
5462#else
5463 // Don't have vpaddq, so keep doubling lane size.
5464 const uint16x8_t x2 = vpaddlq_u8(values.raw);
5465 const uint32x4_t x4 = vpaddlq_u16(x2);
5466 const uint64x2_t x8 = vpaddlq_u32(x4);
5467 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
5468#endif
5469}
5470
5471template <typename T, size_t N, HWY_IF_LE64(T, N)>
5473 const Mask128<T, N> mask) {
5474 // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5475 // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5476 alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
5477 0x10, 0x20, 0x40, 0x80};
5478 const Simd<T, N, 0> d;
5479 const RebindToUnsigned<decltype(d)> du;
5480 const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
5481 const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5482
5483#if HWY_ARCH_ARM_A64
5484 return vaddv_u8(values.raw);
5485#else
5486 const uint16x4_t x2 = vpaddl_u8(values.raw);
5487 const uint32x2_t x4 = vpaddl_u16(x2);
5488 const uint64x1_t x8 = vpaddl_u32(x4);
5489 return vget_lane_u64(x8, 0);
5490#endif
5491}
5492
5493template <typename T>
5495 const Mask128<T> mask) {
5496 alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
5497 0x10, 0x20, 0x40, 0x80};
5498 const Full128<T> d;
5499 const Full128<uint16_t> du;
5500 const Vec128<uint16_t> values =
5501 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
5502#if HWY_ARCH_ARM_A64
5503 return vaddvq_u16(values.raw);
5504#else
5505 const uint32x4_t x2 = vpaddlq_u16(values.raw);
5506 const uint64x2_t x4 = vpaddlq_u32(x2);
5507 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
5508#endif
5509}
5510
5511template <typename T, size_t N, HWY_IF_LE64(T, N)>
5513 const Mask128<T, N> mask) {
5514 // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5515 // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5516 alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
5517 const Simd<T, N, 0> d;
5518 const RebindToUnsigned<decltype(d)> du;
5519 const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
5520 const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5521#if HWY_ARCH_ARM_A64
5522 return vaddv_u16(values.raw);
5523#else
5524 const uint32x2_t x2 = vpaddl_u16(values.raw);
5525 const uint64x1_t x4 = vpaddl_u32(x2);
5526 return vget_lane_u64(x4, 0);
5527#endif
5528}
5529
5530template <typename T>
5532 const Mask128<T> mask) {
5533 alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
5534 const Full128<T> d;
5535 const Full128<uint32_t> du;
5536 const Vec128<uint32_t> values =
5537 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
5538#if HWY_ARCH_ARM_A64
5539 return vaddvq_u32(values.raw);
5540#else
5541 const uint64x2_t x2 = vpaddlq_u32(values.raw);
5542 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
5543#endif
5544}
5545
5546template <typename T, size_t N, HWY_IF_LE64(T, N)>
5548 const Mask128<T, N> mask) {
5549 // Upper lanes of partial loads are undefined. OnlyActive will fix this if
5550 // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
5551 alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
5552 const Simd<T, N, 0> d;
5553 const RebindToUnsigned<decltype(d)> du;
5554 const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
5555 const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
5556#if HWY_ARCH_ARM_A64
5557 return vaddv_u32(values.raw);
5558#else
5559 const uint64x1_t x2 = vpaddl_u32(values.raw);
5560 return vget_lane_u64(x2, 0);
5561#endif
5562}
5563
5564template <typename T>
5566 alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
5567 const Full128<T> d;
5568 const Full128<uint64_t> du;
5569 const Vec128<uint64_t> values =
5570 BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
5571#if HWY_ARCH_ARM_A64
5572 return vaddvq_u64(values.raw);
5573#else
5574 return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
5575#endif
5576}
5577
5578template <typename T>
5580 const Mask128<T, 1> m) {
5581 const Full64<T> d;
5582 const Full64<uint64_t> du;
5583 const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
5584 return vget_lane_u64(values.raw, 0);
5585}
5586
5587// Returns the lowest N for the BitsFromMask result.
5588template <typename T, size_t N>
5589constexpr uint64_t OnlyActive(uint64_t bits) {
5590 return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
5591}
5592
5593template <typename T, size_t N>
5594HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
5595 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5596}
5597
5598// Returns number of lanes whose mask is set.
5599//
5600// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
5601// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
5602// changes each lane to 1 (if mask set) or 0.
5603// NOTE: PopCount also operates on vectors, so we still have to do horizontal
5604// sums separately. We specialize CountTrue for full vectors (negating instead
5605// of PopCount because it avoids an extra shift), and use PopCount of
5606// NibblesFromMask for partial vectors.
5607
5608template <typename T>
5609HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
5610 const Full128<int8_t> di;
5611 const int8x16_t ones =
5612 vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5613
5614#if HWY_ARCH_ARM_A64
5615 return static_cast<size_t>(vaddvq_s8(ones));
5616#else
5617 const int16x8_t x2 = vpaddlq_s8(ones);
5618 const int32x4_t x4 = vpaddlq_s16(x2);
5619 const int64x2_t x8 = vpaddlq_s32(x4);
5620 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
5621#endif
5622}
5623template <typename T>
5624HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
5625 const Full128<int16_t> di;
5626 const int16x8_t ones =
5627 vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5628
5629#if HWY_ARCH_ARM_A64
5630 return static_cast<size_t>(vaddvq_s16(ones));
5631#else
5632 const int32x4_t x2 = vpaddlq_s16(ones);
5633 const int64x2_t x4 = vpaddlq_s32(x2);
5634 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
5635#endif
5636}
5637
5638template <typename T>
5639HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
5640 const Full128<int32_t> di;
5641 const int32x4_t ones =
5642 vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5643
5644#if HWY_ARCH_ARM_A64
5645 return static_cast<size_t>(vaddvq_s32(ones));
5646#else
5647 const int64x2_t x2 = vpaddlq_s32(ones);
5648 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
5649#endif
5650}
5651
5652template <typename T>
5653HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
5654#if HWY_ARCH_ARM_A64
5655 const Full128<int64_t> di;
5656 const int64x2_t ones =
5657 vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
5658 return static_cast<size_t>(vaddvq_s64(ones));
5659#else
5660 const Full128<uint64_t> du;
5661 const auto mask_u = VecFromMask(du, RebindMask(du, mask));
5662 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
5663 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
5664#endif
5665}
5666
5667} // namespace detail
5668
5669// Full
5670template <typename T>
5671HWY_API size_t CountTrue(Full128<T> /* tag */, const Mask128<T> mask) {
5672 return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
5673}
5674
5675// Partial
5676template <typename T, size_t N, HWY_IF_LE64(T, N)>
5677HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
5678 constexpr int kDiv = 4 * sizeof(T);
5679 return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
5680}
5681
5682template <typename T, size_t N>
5683HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> d,
5684 const Mask128<T, N> mask) {
5685 const uint64_t nib = detail::NibblesFromMask(d, mask);
5686 constexpr size_t kDiv = 4 * sizeof(T);
5687 return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
5688}
5689
5690template <typename T, size_t N>
5691HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
5692 const Mask128<T, N> mask) {
5693 const uint64_t nib = detail::NibblesFromMask(d, mask);
5694 if (nib == 0) return -1;
5695 constexpr int kDiv = 4 * sizeof(T);
5696 return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
5697}
5698
5699// `p` points to at least 8 writable bytes.
5700template <typename T, size_t N>
5701HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
5702 uint8_t* bits) {
5703 const uint64_t mask_bits = detail::BitsFromMask(mask);
5704 const size_t kNumBytes = (N + 7) / 8;
5705 CopyBytes<kNumBytes>(&mask_bits, bits);
5706 return kNumBytes;
5707}
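// Usage sketch (not part of the upstream header; the function name is
// hypothetical): for lane sizes >= 2 bytes there are at most 8 lanes, so the
// whole mask fits in the first of the 8 writable bytes required above.
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API uint8_t ExampleMaskToByte(Full128<T> d, const Mask128<T> m) {
  uint8_t bits[8];  // StoreMaskBits expects at least 8 writable bytes
  (void)StoreMaskBits(d, m, bits);
  return bits[0];  // bit i corresponds to lane i
}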
5708
5709template <typename T, size_t N>
5710HWY_API bool AllFalse(const Simd<T, N, 0> d, const Mask128<T, N> m) {
5711 return detail::NibblesFromMask(d, m) == 0;
5712}
5713
5714// Full
5715template <typename T>
5716HWY_API bool AllTrue(const Full128<T> d, const Mask128<T> m) {
5717 return detail::NibblesFromMask(d, m) == ~0ull;
5718}
5719// Partial
5720template <typename T, size_t N, HWY_IF_LE64(T, N)>
5721HWY_API bool AllTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
5722 constexpr size_t kBytes = sizeof(T) * N;
5723 return detail::NibblesFromMask(d, m) == (1ull << (kBytes * 4)) - 1;
5724}
5725
5726// ------------------------------ Compress
5727
5728template <typename T>
5729struct CompressIsPartition {
5730 enum { value = (sizeof(T) != 1) };
5731};
5732
5733namespace detail {
5734
5735// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
5736HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /*d*/,
5737 const uint8_t* bytes) {
5738 return Vec128<uint8_t>(vreinterpretq_u8_u64(
5739 vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
5740}
5741
5742// Load 8 bytes and return half-reg with N <= 8 bytes.
5743template <size_t N, HWY_IF_LE64(uint8_t, N)>
5744HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N, 0> d,
5745 const uint8_t* bytes) {
5746 return Load(d, bytes);
5747}
5748
5749template <typename T, size_t N>
5750HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
5751 const uint64_t mask_bits) {
5752 HWY_DASSERT(mask_bits < 256);
5753 const Simd<T, N, 0> d;
5754 const Repartition<uint8_t, decltype(d)> d8;
5755 const Simd<uint16_t, N, 0> du;
5756
5757 // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
5758 // indices for VTBL (one vector's worth for each of 256 combinations of
5759 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5760 // store lane indices and convert to byte indices (2*lane + 0..1), with the
5761 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5762 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5763 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5764 // is likely more costly than the higher cache footprint from storing bytes.
5765 alignas(16) constexpr uint8_t table[256 * 8] = {
5766 // PrintCompress16x8Tables
5767 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5768 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5769 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
5770 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5771 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
5772 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
5773 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
5774 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5775 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
5776 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
5777 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
5778 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
5779 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
5780 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
5781 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
5782 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5783 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
5784 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
5785 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
5786 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
5787 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
5788 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
5789 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
5790 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
5791 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
5792 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
5793 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
5794 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
5795 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
5796 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
5797 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
5798 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5799 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
5800 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
5801 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
5802 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
5803 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
5804 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
5805 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
5806 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
5807 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
5808 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
5809 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
5810 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
5811 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
5812 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
5813 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
5814 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
5815 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
5816 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
5817 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
5818 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
5819 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
5820 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
5821 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
5822 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
5823 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
5824 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
5825 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
5826 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
5827 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
5828 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
5829 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
5830 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5831 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
5832 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
5833 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
5834 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
5835 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
5836 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
5837 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
5838 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
5839 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
5840 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
5841 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
5842 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
5843 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
5844 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
5845 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
5846 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
5847 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
5848 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
5849 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
5850 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
5851 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
5852 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
5853 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
5854 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
5855 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
5856 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
5857 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
5858 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
5859 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
5860 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
5861 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
5862 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
5863 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
5864 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
5865 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
5866 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
5867 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
5868 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
5869 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
5870 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
5871 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
5872 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
5873 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
5874 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
5875 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
5876 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
5877 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
5878 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
5879 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
5880 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
5881 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
5882 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
5883 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
5884 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
5885 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
5886 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
5887 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
5888 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
5889 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
5890 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
5891 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
5892 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
5893 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
5894 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5895
5896 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
5897 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5898 return BitCast(d, pairs + Set(du, 0x0100));
5899}
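// Worked example of the byte-pair construction above (sketch, not part of the
// upstream header): the table stores 2*lane, e.g. 6 for lane 3. ZipLower
// replicates that byte into both halves of a u16 (0x0606), and adding 0x0100
// yields 0x0706, whose little-endian bytes {6, 7} select exactly the two
// bytes of u16 lane 3 in TableLookupBytes.
static_assert(0x0606 + 0x0100 == 0x0706, "2*lane pair -> {2*lane, 2*lane+1}");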
5900
5901template <typename T, size_t N>
5902HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
5903 const uint64_t mask_bits) {
5904 HWY_DASSERT(mask_bits < 256);
5905 const Simd<T, N, 0> d;
5906 const Repartition<uint8_t, decltype(d)> d8;
5907 const Simd<uint16_t, N, 0> du;
5908
5909 // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
5910 // indices for VTBL (one vector's worth for each of 256 combinations of
5911 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5912 // store lane indices and convert to byte indices (2*lane + 0..1), with the
5913 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5914 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5915 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5916 // is likely more costly than the higher cache footprint from storing bytes.
5917 alignas(16) constexpr uint8_t table[256 * 8] = {
5918 // PrintCompressNot16x8Tables
5919 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
5920 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
5921 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
5922 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
5923 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
5924 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
5925 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
5926 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
5927 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
5928 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
5929 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
5930 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
5931 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
5932 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
5933 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
5934 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
5935 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
5936 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
5937 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
5938 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
5939 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
5940 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
5941 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
5942 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
5943 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
5944 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
5945 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
5946 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
5947 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
5948 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
5949 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
5950 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
5951 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
5952 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
5953 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
5954 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
5955 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
5956 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
5957 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
5958 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
5959 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
5960 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
5961 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
5962 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
5963 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
5964 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
5965 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
5966 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
5967 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
5968 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
5969 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
5970 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
5971 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
5972 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
5973 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
5974 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
5975 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
5976 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
5977 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
5978 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
5979 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
5980 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
5981 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
5982 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
5983 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
5984 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
5985 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
5986 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
5987 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
5988 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
5989 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
5990 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
5991 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
5992 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
5993 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
5994 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
5995 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
5996 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
5997 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
5998 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
5999 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
6000 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
6001 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
6002 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
6003 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
6004 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
6005 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
6006 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
6007 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
6008 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
6009 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
6010 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
6011 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
6012 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
6013 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
6014 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
6015 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
6016 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
6017 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
6018 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
6019 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
6020 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
6021 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
6022 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
6023 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
6024 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
6025 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
6026 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
6027 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
6028 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
6029 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
6030 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
6031 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
6032 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
6033 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
6034 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
6035 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
6036 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
6037 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
6038 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
6039 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
6040 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
6041 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
6042 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
6043 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
6044 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
6045 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
6046 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6047
6048 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
6049 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6050 return BitCast(d, pairs + Set(du, 0x0100));
6051}
6052
6053template <typename T, size_t N>
6054HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
6055 const uint64_t mask_bits) {
6056 HWY_DASSERT(mask_bits < 16);
6057
6058 // There are only 4 lanes, so we can afford to load the index vector directly.
6059 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
6060 // PrintCompress32x4Tables
6061 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6062 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6063 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
6064 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6065 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
6066 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
6067 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
6068 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6069 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
6070 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
6071 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
6072 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
6073 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
6074 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
6075 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
6076 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6077 const Simd<T, N, 0> d;
6078 const Repartition<uint8_t, decltype(d)> d8;
6079 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6080}
6081
6082template <typename T, size_t N>
6083HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
6084 const uint64_t mask_bits) {
6085 HWY_DASSERT(mask_bits < 16);
6086
6087 // There are only 4 lanes, so we can afford to load the index vector directly.
6088 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
6089 // PrintCompressNot32x4Tables
6090 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6091 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6092 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6093 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6094 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6095 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6096 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6097 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6098 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6099 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6100 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6101 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6102 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6103 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6104 12, 13, 14, 15};
6105 const Simd<T, N, 0> d;
6106 const Repartition<uint8_t, decltype(d)> d8;
6107 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6108}
6109
6110#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
6111
6112template <typename T, size_t N>
6113HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
6114 const uint64_t mask_bits) {
6115 HWY_DASSERT(mask_bits < 4);
6116
6117 // There are only 2 lanes, so we can afford to load the index vector directly.
6118 alignas(16) constexpr uint8_t u8_indices[64] = {
6119 // PrintCompress64x2Tables
6120 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6121 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6122 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6123 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6124
6125 const Simd<T, N, 0> d;
6126 const Repartition<uint8_t, decltype(d)> d8;
6127 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6128}
6129
6130template <typename T, size_t N>
6131HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
6132 const uint64_t mask_bits) {
6133 HWY_DASSERT(mask_bits < 4);
6134
6135 // There are only 2 lanes, so we can afford to load the index vector directly.
6136 alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
6137 // PrintCompressNot64x2Tables
6138 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6139 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6140 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6141 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6142
6143 const Simd<T, N, 0> d;
6144 const Repartition<uint8_t, decltype(d)> d8;
6145 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6146}
6147
6148#endif
6149
6150// Helper function called by both Compress and CompressStore - avoids a
6151// redundant BitsFromMask in the latter.
6152template <typename T, size_t N>
6153HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
6154 const auto idx =
6155 detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
6156 using D = Simd<T, N, 0>;
6157 const RebindToSigned<D> di;
6158 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
6159}
6160
6161template <typename T, size_t N>
6162HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
6163 const auto idx =
6164 detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
6165 using D = Simd<T, N, 0>;
6166 const RebindToSigned<D> di;
6167 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
6168}
6169
6170} // namespace detail
6171
6172// Single lane: no-op
6173template <typename T>
6174HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
6175 return v;
6176}
6177
6178// Two lanes: conditional swap
6179template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6180HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6181 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
6182 const Simd<T, N, 0> d;
6183 const Vec128<T, N> m = VecFromMask(d, mask);
6184 const Vec128<T, N> maskL = DupEven(m);
6185 const Vec128<T, N> maskH = DupOdd(m);
6186 const Vec128<T, N> swap = AndNot(maskL, maskH);
6187 return IfVecThenElse(swap, Shuffle01(v), v);
6188}
6189
6190// General case, 2 or 4 byte lanes
6191template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6192HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
6193 return detail::Compress(v, detail::BitsFromMask(mask));
6194}
6195
6196// Single lane: no-op
6197template <typename T>
6198HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
6199 return v;
6200}
6201
6202// Two lanes: conditional swap
6203template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6204HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
6205 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
6206 const Full128<T> d;
6207 const Vec128<T> m = VecFromMask(d, mask);
6208 const Vec128<T> maskL = DupEven(m);
6209 const Vec128<T> maskH = DupOdd(m);
6210 const Vec128<T> swap = AndNot(maskH, maskL);
6211 return IfVecThenElse(swap, Shuffle01(v), v);
6212}
6213
6214// General case, 2 or 4 byte lanes
6215template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6216HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6217 // For partial vectors, we cannot pull the Not() into the table because
6218 // BitsFromMask clears the upper bits.
6219 if (N < 16 / sizeof(T)) {
6220 return detail::Compress(v, detail::BitsFromMask(Not(mask)));
6221 }
6222 return detail::CompressNot(v, detail::BitsFromMask(mask));
6223}
6224
6225// ------------------------------ CompressBlocksNot
6226HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6227 Mask128<uint64_t> /* m */) {
6228 return v;
6229}
6230
6231// ------------------------------ CompressBits
6232
6233template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6234HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6235 const uint8_t* HWY_RESTRICT bits) {
6236 uint64_t mask_bits = 0;
6237 constexpr size_t kNumBytes = (N + 7) / 8;
6238 CopyBytes<kNumBytes>(bits, &mask_bits);
6239 if (N < 8) {
6240 mask_bits &= (1ull << N) - 1;
6241 }
6242
6243 return detail::Compress(v, mask_bits);
6244}
6245
6246// ------------------------------ CompressStore
6247template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6248HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
6249 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6250 const uint64_t mask_bits = detail::BitsFromMask(mask);
6251 StoreU(detail::Compress(v, mask_bits), d, unaligned);
6252 return PopCount(mask_bits);
6253}
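// Usage sketch (not part of the upstream header; the function name is
// hypothetical): pack the lanes of `in` that compare greater than zero into
// the front of `out`, returning how many lanes were kept.
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API size_t ExampleKeepPositive(Simd<T, N, 0> d, const T* HWY_RESTRICT in,
                                   T* HWY_RESTRICT out) {
  const Vec128<T, N> v = LoadU(d, in);
  return CompressStore(v, v > Zero(d), d, out);
}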
6254
6255// ------------------------------ CompressBlendedStore
6256template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6257HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
6258 Simd<T, N, 0> d,
6259 T* HWY_RESTRICT unaligned) {
6260 const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
6261 using TU = TFromD<decltype(du)>;
6262 const uint64_t mask_bits = detail::BitsFromMask(m);
6263 const size_t count = PopCount(mask_bits);
6264 const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
6265 const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
6266 BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
6267 return count;
6268}
6269
6270// ------------------------------ CompressBitsStore
6271
6272template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6273HWY_API size_t CompressBitsStore(Vec128<T, N> v,
6274 const uint8_t* HWY_RESTRICT bits,
6275 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6276 uint64_t mask_bits = 0;
6277 constexpr size_t kNumBytes = (N + 7) / 8;
6278 CopyBytes<kNumBytes>(bits, &mask_bits);
6279 if (N < 8) {
6280 mask_bits &= (1ull << N) - 1;
6281 }
6282
6283 StoreU(detail::Compress(v, mask_bits), d, unaligned);
6284 return PopCount(mask_bits);
6285}
6286
6287// ------------------------------ LoadInterleaved2
6288
6289// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
6290#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
6291#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
6292#else
6293#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
6294#endif
6295
6296namespace detail {
6297#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
6298#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
6299
6300#if HWY_ARCH_ARM_A64
6301#define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
6302#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6303#else
6304// Exclude 64x2 and f64x1, which are only supported on aarch64
6305#define HWY_IF_LOAD_INT(T, N) \
6306 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6307#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
6308 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6309 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6310 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6311 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6312 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6313#endif // HWY_ARCH_ARM_A64
6314
6315// Must return raw tuple because Tuple2 lacks a ctor, and we cannot use
6316// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
6317// void.
6318#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6319 decltype(Tuple2<type##_t, size>().raw)
6320// Tuple tag arg allows overloading (cannot just overload on return type)
6321#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6322 const type##_t *from, Tuple2<type##_t, size>
6323HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
6324#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6325#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6326
6327#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6328 decltype(Tuple3<type##_t, size>().raw)
6329#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6330 const type##_t *from, Tuple3<type##_t, size>
6331HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
6332#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6333#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6334
6335#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6336 decltype(Tuple4<type##_t, size>().raw)
6337#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6338 const type##_t *from, Tuple4<type##_t, size>
6339HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
6340#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6341#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6342
6343#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
6344#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
6345#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
6346} // namespace detail
6347
6348template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
6349HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
6350 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6351 Vec128<T, N>& v1) {
6352 auto raw = detail::LoadInterleaved2(unaligned, detail::Tuple2<T, N>());
6353 v0 = Vec128<T, N>(raw.val[0]);
6354 v1 = Vec128<T, N>(raw.val[1]);
6355}
6356
6357// <= 32 bits: avoid loading more than N bytes by copying to buffer
6358template <typename T, size_t N, HWY_IF_LE32(T, N)>
6359HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
6360 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6361 Vec128<T, N>& v1) {
6362 // The smallest vector registers are 64-bits and we want space for two.
6363 alignas(16) T buf[2 * 8 / sizeof(T)] = {};
6364 CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
6365 auto raw = detail::LoadInterleaved2(buf, detail::Tuple2<T, N>());
6366 v0 = Vec128<T, N>(raw.val[0]);
6367 v1 = Vec128<T, N>(raw.val[1]);
6368}
6369
6370#if HWY_ARCH_ARM_V7
6371// 64x2: split into two 64x1
6372template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6373HWY_API void LoadInterleaved2(Full128<T> d, const T* HWY_RESTRICT unaligned,
6374 Vec128<T>& v0, Vec128<T>& v1) {
6375 const Half<decltype(d)> dh;
6376 VFromD<decltype(dh)> v00, v10, v01, v11;
6377 LoadInterleaved2(dh, unaligned, v00, v10);
6378 LoadInterleaved2(dh, unaligned + 2, v01, v11);
6379 v0 = Combine(d, v01, v00);
6380 v1 = Combine(d, v11, v10);
6381}
6382#endif // HWY_ARCH_ARM_V7
6383
6384// ------------------------------ LoadInterleaved3
6385
6386template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
6387HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
6388 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6389 Vec128<T, N>& v1, Vec128<T, N>& v2) {
6390 auto raw = detail::LoadInterleaved3(unaligned, detail::Tuple3<T, N>());
6391 v0 = Vec128<T, N>(raw.val[0]);
6392 v1 = Vec128<T, N>(raw.val[1]);
6393 v2 = Vec128<T, N>(raw.val[2]);
6394}
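// Usage sketch (not part of the upstream header; the function name is
// hypothetical): de-interleave 16 RGB pixels (48 bytes) into separate R, G
// and B vectors with a single vld3-based load.
HWY_API void ExampleSplitRgb(Full128<uint8_t> d,
                             const uint8_t* HWY_RESTRICT rgb,
                             Vec128<uint8_t>& r, Vec128<uint8_t>& g,
                             Vec128<uint8_t>& b) {
  LoadInterleaved3(d, rgb, r, g, b);  // reads r0,g0,b0, r1,g1,b1, ...
}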
6395
6396// <= 32 bits: avoid loading more than N bytes by copying to buffer
6397template <typename T, size_t N, HWY_IF_LE32(T, N)>
6398HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
6399 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6400 Vec128<T, N>& v1, Vec128<T, N>& v2) {
6401 // The smallest vector registers are 64-bits and we want space for three.
6402 alignas(16) T buf[3 * 8 / sizeof(T)] = {};
6403 CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
6404 auto raw = detail::LoadInterleaved3(buf, detail::Tuple3<T, N>());
6405 v0 = Vec128<T, N>(raw.val[0]);
6406 v1 = Vec128<T, N>(raw.val[1]);
6407 v2 = Vec128<T, N>(raw.val[2]);
6408}
6409
6410#if HWY_ARCH_ARM_V7
6411// 64x2: split into two 64x1
6412template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6413HWY_API void LoadInterleaved3(Full128<T> d, const T* HWY_RESTRICT unaligned,
6414 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
6415 const Half<decltype(d)> dh;
6416 VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
6417 LoadInterleaved3(dh, unaligned, v00, v10, v20);
6418 LoadInterleaved3(dh, unaligned + 3, v01, v11, v21);
6419 v0 = Combine(d, v01, v00);
6420 v1 = Combine(d, v11, v10);
6421 v2 = Combine(d, v21, v20);
6422}
6423#endif // HWY_ARCH_ARM_V7
6424
6425// ------------------------------ LoadInterleaved4
6426
6427template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
6428HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
6429 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6430 Vec128<T, N>& v1, Vec128<T, N>& v2,
6431 Vec128<T, N>& v3) {
6432 auto raw = detail::LoadInterleaved4(unaligned, detail::Tuple4<T, N>());
6433 v0 = Vec128<T, N>(raw.val[0]);
6434 v1 = Vec128<T, N>(raw.val[1]);
6435 v2 = Vec128<T, N>(raw.val[2]);
6436 v3 = Vec128<T, N>(raw.val[3]);
6437}
6438
6439// <= 32 bits: avoid loading more than N bytes by copying to buffer
6440template <typename T, size_t N, HWY_IF_LE32(T, N)>
6441HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
6442 const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
6443 Vec128<T, N>& v1, Vec128<T, N>& v2,
6444 Vec128<T, N>& v3) {
6445 alignas(16) T buf[4 * 8 / sizeof(T)] = {};
6446 CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
6447 auto raw = detail::LoadInterleaved4(buf, detail::Tuple4<T, N>());
6448 v0 = Vec128<T, N>(raw.val[0]);
6449 v1 = Vec128<T, N>(raw.val[1]);
6450 v2 = Vec128<T, N>(raw.val[2]);
6451 v3 = Vec128<T, N>(raw.val[3]);
6452}
6453
6454#if HWY_ARCH_ARM_V7
6455// 64x2: split into two 64x1
6456template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6457HWY_API void LoadInterleaved4(Full128<T> d, const T* HWY_RESTRICT unaligned,
6458 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
6459 Vec128<T>& v3) {
6460 const Half<decltype(d)> dh;
6461 VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
6462 LoadInterleaved4(dh, unaligned, v00, v10, v20, v30);
6463 LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31);
6464 v0 = Combine(d, v01, v00);
6465 v1 = Combine(d, v11, v10);
6466 v2 = Combine(d, v21, v20);
6467 v3 = Combine(d, v31, v30);
6468}
6469#endif // HWY_ARCH_ARM_V7
6470
6471#undef HWY_IF_LOAD_INT
6472
6473// ------------------------------ StoreInterleaved2
6474
6475namespace detail {
6476#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
6477#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
6478#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
6479
6480#if HWY_ARCH_ARM_A64
6481#define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
6482#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6483#else
6484// Exclude 64x2 and f64x1, which are only supported on aarch64
6485#define HWY_IF_STORE_INT(T, N) \
6486 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6487#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
6488 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6489 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6490 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6491 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6492 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6493#endif // HWY_ARCH_ARM_A64
6494
6495#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6496 Tuple2<type##_t, size> tup, type##_t *to
6497HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT)
6498#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6499
6500#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6501 Tuple3<type##_t, size> tup, type##_t *to
6502HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT)
6503#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6504
6505#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6506 Tuple4<type##_t, size> tup, type##_t *to
6507HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT)
6508#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6509
6510#undef HWY_NEON_DEF_FUNCTION_STORE_INT
6511#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
6512#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
6513#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
6514} // namespace detail
6515
6516template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6517HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
6518 Simd<T, N, 0> /*tag*/,
6519 T* HWY_RESTRICT unaligned) {
6520 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6521 detail::StoreInterleaved2(tup, unaligned);
6522}
6523
6524// <= 32 bits: avoid writing more than N bytes by copying to buffer
6525template <typename T, size_t N, HWY_IF_LE32(T, N)>
6526HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
6527 Simd<T, N, 0> /*tag*/,
6528 T* HWY_RESTRICT unaligned) {
6529 alignas(16) T buf[2 * 8 / sizeof(T)];
6530 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6531 detail::StoreInterleaved2(tup, buf);
6532 CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
6533}
6534
6535#if HWY_ARCH_ARM_V7
6536// 64x2: split into two 64x1
6537template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6538HWY_API void StoreInterleaved2(const Vec128<T> v0, const Vec128<T> v1,
6539 Full128<T> d, T* HWY_RESTRICT unaligned) {
6540 const Half<decltype(d)> dh;
6541 StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned);
6542 StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2);
6543}
6544#endif // HWY_ARCH_ARM_V7
6545
6546// ------------------------------ StoreInterleaved3
6547
6548template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6549HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
6550 const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
6551 T* HWY_RESTRICT unaligned) {
6552 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6553 detail::StoreInterleaved3(tup, unaligned);
6554}
6555
6556// <= 32 bits: avoid writing more than N bytes by copying to buffer
6557template <typename T, size_t N, HWY_IF_LE32(T, N)>
6558HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
6559 const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
6560 T* HWY_RESTRICT unaligned) {
6561 alignas(16) T buf[3 * 8 / sizeof(T)];
6562 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6563 detail::StoreInterleaved3(tup, buf);
6564 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
6565}
6566
6567#if HWY_ARCH_ARM_V7
6568// 64x2: split into two 64x1
6569template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6570HWY_API void StoreInterleaved3(const Vec128<T> v0, const Vec128<T> v1,
6571 const Vec128<T> v2, Full128<T> d,
6572 T* HWY_RESTRICT unaligned) {
6573 const Half<decltype(d)> dh;
6574 StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
6575 unaligned);
6576 StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
6577 unaligned + 3);
6578}
6579#endif // HWY_ARCH_ARM_V7
6580
6581// ------------------------------ StoreInterleaved4
6582
6583template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
6584HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
6585 const Vec128<T, N> v2, const Vec128<T, N> v3,
6586 Simd<T, N, 0> /*tag*/,
6587 T* HWY_RESTRICT unaligned) {
6588 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6589 detail::StoreInterleaved4(tup, unaligned);
6590}
6591
6592// <= 32 bits: avoid writing more than N bytes by copying to buffer
6593template <typename T, size_t N, HWY_IF_LE32(T, N)>
6594HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
6595 const Vec128<T, N> v2, const Vec128<T, N> v3,
6596 Simd<T, N, 0> /*tag*/,
6597 T* HWY_RESTRICT unaligned) {
6598 alignas(16) T buf[4 * 8 / sizeof(T)];
6599 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6600 detail::StoreInterleaved4(tup, buf);
6601 CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
6602}
6603
6604#if HWY_ARCH_ARM_V7
6605// 64x2: split into two 64x1
6606template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6607HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
6608 const Vec128<T> v2, const Vec128<T> v3,
6609 Full128<T> d, T* HWY_RESTRICT unaligned) {
6610 const Half<decltype(d)> dh;
6611 StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
6612 LowerHalf(dh, v3), dh, unaligned);
6613 StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
6614 UpperHalf(dh, v3), dh, unaligned + 4);
6615}
6616#endif // HWY_ARCH_ARM_V7
6617
6618#undef HWY_IF_STORE_INT
6619
6620// ------------------------------ Lt128
6621
6622template <typename T, size_t N, HWY_IF_LE128(T, N)>
6623HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
6624 Vec128<T, N> b) {
6625 static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
6626 // Truth table of Eq and Lt for Hi and Lo u64.
6627 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
6628 // =H =L cH cL | out = cH | (=H & cL)
6629 // 0 0 0 0 | 0
6630 // 0 0 0 1 | 0
6631 // 0 0 1 0 | 1
6632 // 0 0 1 1 | 1
6633 // 0 1 0 0 | 0
6634 // 0 1 0 1 | 0
6635 // 0 1 1 0 | 1
6636 // 1 0 0 0 | 0
6637 // 1 0 0 1 | 1
6638 // 1 1 0 0 | 0
6639 const Mask128<T, N> eqHL = Eq(a, b);
6640 const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
6641 // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
6642 // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
6643 // comparison result leftwards requires only 4. IfThenElse compiles to the
6644 // same code as OrAnd().
6645 const Vec128<T, N> ltLx = DupEven(ltHL);
6646 const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
6647 return MaskFromVec(DupOdd(outHx));
6648}
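// Scalar model of the comparison above (sketch, not part of the upstream
// header; names are hypothetical): for one pair of (hi, lo) u64 halves,
// out = cH | (=H & cL) as in the truth table.
constexpr bool ExampleLt128Scalar(uint64_t aH, uint64_t aL, uint64_t bH,
                                  uint64_t bL) {
  return (aH < bH) || (aH == bH && aL < bL);
}
static_assert(ExampleLt128Scalar(1, 0, 1, 5), "equal hi: lo decides");
static_assert(!ExampleLt128Scalar(2, 0, 1, ~0ull), "hi dominates lo");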
6649
6650template <typename T, size_t N, HWY_IF_LE128(T, N)>
6651HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
6652 Vec128<T, N> b) {
6653 const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
6654 return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
6655}
6656
6657// ------------------------------ Eq128
6658
6659template <typename T, size_t N, HWY_IF_LE128(T, N)>
6660HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
6661 Vec128<T, N> b) {
6662 static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
6663 const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
6664 return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
6665}
6666
6667template <typename T, size_t N, HWY_IF_LE128(T, N)>
6668HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
6669 Vec128<T, N> b) {
6670 const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
6671 return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
6672}
6673
6674// ------------------------------ Ne128
6675
6676template <typename T, size_t N, HWY_IF_LE128(T, N)>
6677HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
6678 Vec128<T, N> b) {
6679 static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
6680 const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
6681 return MaskFromVec(Or(Reverse2(d, neHL), neHL));
6682}
6683
6684template <typename T, size_t N, HWY_IF_LE128(T, N)>
6685HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
6686 Vec128<T, N> b) {
6687 const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
6688 return MaskFromVec(InterleaveUpper(d, neHL, neHL));
6689}
6690
6691// ------------------------------ Min128, Max128 (Lt128)
6692
6693// Without a native OddEven, it seems infeasible to go faster than Lt128.
6694template <class D>
6695HWY_API VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
6696 return IfThenElse(Lt128(d, a, b), a, b);
6697}
6698
6699template <class D>
6700HWY_API VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
6701 return IfThenElse(Lt128(d, b, a), a, b);
6702}
6703
6704template <class D>
6705HWY_API VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
6706 return IfThenElse(Lt128Upper(d, a, b), a, b);
6707}
6708
6709template <class D>
6710HWY_API VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
6711 return IfThenElse(Lt128Upper(d, b, a), a, b);
6712}
6713
6714namespace detail { // for code folding
6715#if HWY_ARCH_ARM_V7
6716#undef vuzp1_s8
6717#undef vuzp1_u8
6718#undef vuzp1_s16
6719#undef vuzp1_u16
6720#undef vuzp1_s32
6721#undef vuzp1_u32
6722#undef vuzp1_f32
6723#undef vuzp1q_s8
6724#undef vuzp1q_u8
6725#undef vuzp1q_s16
6726#undef vuzp1q_u16
6727#undef vuzp1q_s32
6728#undef vuzp1q_u32
6729#undef vuzp1q_f32
6730#undef vuzp2_s8
6731#undef vuzp2_u8
6732#undef vuzp2_s16
6733#undef vuzp2_u16
6734#undef vuzp2_s32
6735#undef vuzp2_u32
6736#undef vuzp2_f32
6737#undef vuzp2q_s8
6738#undef vuzp2q_u8
6739#undef vuzp2q_s16
6740#undef vuzp2q_u16
6741#undef vuzp2q_s32
6742#undef vuzp2q_u32
6743#undef vuzp2q_f32
6744#undef vzip1_s8
6745#undef vzip1_u8
6746#undef vzip1_s16
6747#undef vzip1_u16
6748#undef vzip1_s32
6749#undef vzip1_u32
6750#undef vzip1_f32
6751#undef vzip1q_s8
6752#undef vzip1q_u8
6753#undef vzip1q_s16
6754#undef vzip1q_u16
6755#undef vzip1q_s32
6756#undef vzip1q_u32
6757#undef vzip1q_f32
6758#undef vzip2_s8
6759#undef vzip2_u8
6760#undef vzip2_s16
6761#undef vzip2_u16
6762#undef vzip2_s32
6763#undef vzip2_u32
6764#undef vzip2_f32
6765#undef vzip2q_s8
6766#undef vzip2q_u8
6767#undef vzip2q_s16
6768#undef vzip2q_u16
6769#undef vzip2q_s32
6770#undef vzip2q_u32
6771#undef vzip2q_f32
6772#endif
6773
6774#undef HWY_NEON_BUILD_ARG_1
6775#undef HWY_NEON_BUILD_ARG_2
6776#undef HWY_NEON_BUILD_ARG_3
6777#undef HWY_NEON_BUILD_PARAM_1
6778#undef HWY_NEON_BUILD_PARAM_2
6779#undef HWY_NEON_BUILD_PARAM_3
6780#undef HWY_NEON_BUILD_RET_1
6781#undef HWY_NEON_BUILD_RET_2
6782#undef HWY_NEON_BUILD_RET_3
6783#undef HWY_NEON_BUILD_TPL_1
6784#undef HWY_NEON_BUILD_TPL_2
6785#undef HWY_NEON_BUILD_TPL_3
6786#undef HWY_NEON_DEF_FUNCTION
6787#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
6788#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
6789#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
6790#undef HWY_NEON_DEF_FUNCTION_FULL_UI
6791#undef HWY_NEON_DEF_FUNCTION_INT_16
6792#undef HWY_NEON_DEF_FUNCTION_INT_32
6793#undef HWY_NEON_DEF_FUNCTION_INT_8
6794#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
6795#undef HWY_NEON_DEF_FUNCTION_INTS
6796#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
6797#undef HWY_NEON_DEF_FUNCTION_TPL
6798#undef HWY_NEON_DEF_FUNCTION_UIF81632
6799#undef HWY_NEON_DEF_FUNCTION_UINT_16
6800#undef HWY_NEON_DEF_FUNCTION_UINT_32
6801#undef HWY_NEON_DEF_FUNCTION_UINT_8
6802#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
6803#undef HWY_NEON_DEF_FUNCTION_UINTS
6804#undef HWY_NEON_EVAL
6805} // namespace detail
6806
6807// NOLINTNEXTLINE(google-readability-namespace-comments)
6808} // namespace HWY_NAMESPACE
6809} // namespace hwy
#define HWY_NEON_BUILD_RET_2(type, size)
Definition arm_neon-inl.h:53
HWY_AFTER_NAMESPACE()
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition arm_neon-inl.h:166
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition arm_neon-inl.h:189
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition arm_neon-inl.h:199
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)
Definition arm_neon-inl.h:178
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition arm_neon-inl.h:145
#define HWY_NEON_BUILD_ARG_3
Definition arm_neon-inl.h:68
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition arm_neon-inl.h:140
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args)
Definition arm_neon-inl.h:6487
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition arm_neon-inl.h:98
#define HWY_NEON_BUILD_ARG_2
Definition arm_neon-inl.h:67
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition arm_neon-inl.h:194
#define HWY_NEON_BUILD_PARAM_2(type, size)
Definition arm_neon-inl.h:58
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)
Definition arm_neon-inl.h:209
#define HWY_NEON_BUILD_TPL_1
Definition arm_neon-inl.h:46
#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix)
Definition arm_neon-inl.h:5213
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition arm_neon-inl.h:128
#define HWY_NEON_BUILD_TPL_2
Definition arm_neon-inl.h:47
HWY_BEFORE_NAMESPACE()
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition arm_neon-inl.h:172
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)
Definition arm_neon-inl.h:83
#define HWY_NEON_EVAL(func,...)
Definition arm_neon-inl.h:77
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition arm_neon-inl.h:114
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition arm_neon-inl.h:121
#define HWY_NEON_BUILD_TPL_3
Definition arm_neon-inl.h:48
#define HWY_NEON_BUILD_RET_3(type, size)
Definition arm_neon-inl.h:54
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)
Definition arm_neon-inl.h:203
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition arm_neon-inl.h:184
#define HWY_NEON_BUILD_PARAM_1(type, size)
Definition arm_neon-inl.h:57
#define HWY_NEON_BUILD_RET_1(type, size)
Definition arm_neon-inl.h:52
#define HWY_NEON_BUILD_ARG_1
Definition arm_neon-inl.h:66
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition arm_neon-inl.h:106
#define HWY_NEON_BUILD_PARAM_3(type, size)
Definition arm_neon-inl.h:60
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition arm_neon-inl.h:134
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)
Definition arm_neon-inl.h:6307
#define HWY_IF_FLOAT(T)
Definition base.h:417
#define HWY_RESTRICT
Definition base.h:64
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:78
#define HWY_IF_LE64(T, N)
Definition base.h:407
#define HWY_API
Definition base.h:129
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_IF_NOT_FLOAT(T)
Definition base.h:418
#define HWY_INLINE
Definition base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:79
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_ASSERT(condition)
Definition base.h:192
#define HWY_CONCAT(a, b)
Definition base.h:132
#define HWY_IF_UNSIGNED(T)
Definition base.h:414
Definition arm_neon-inl.h:825
HWY_INLINE Mask128()
Definition arm_neon-inl.h:830
Mask128(const Mask128 &)=default
Mask128 & operator=(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition arm_neon-inl.h:833
Raw raw
Definition arm_neon-inl.h:835
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition arm_neon-inl.h:827
Definition arm_neon-inl.h:778
HWY_INLINE Vec128()
Definition arm_neon-inl.h:785
T PrivateT
Definition arm_neon-inl.h:782
HWY_INLINE Vec128(const Raw raw)
Definition arm_neon-inl.h:788
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition arm_neon-inl.h:795
typename detail::Raw128< T, N >::type Raw
Definition arm_neon-inl.h:779
Raw raw
Definition arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition arm_neon-inl.h:801
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition arm_neon-inl.h:810
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition arm_neon-inl.h:807
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition arm_neon-inl.h:792
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition arm_neon-inl.h:804
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition arm_neon-inl.h:798
#define HWY_COMPILER_GCC_ACTUAL
Definition detect_compiler_arch.h:109
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5447
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition arm_neon-inl.h:1748
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition arm_neon-inl.h:5736
HWY_INLINE Vec128< T, N > Set64(Simd< T, N, 0 >, uint64_t mask_bits)
Definition arm_neon-inl.h:5354
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5902
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6153
HWY_INLINE uint64_t NibblesFromMask(const Full128< T > d, Mask128< T > mask)
Definition arm_neon-inl.h:5421
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5063
HWY_INLINE Vec128< T, 1 > SumOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5058
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6162
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5609
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3418
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition arm_neon-inl.h:2080
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5750
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5068
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:5589
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4235
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition arm_neon-inl.h:5364
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition arm_neon-inl.h:1899
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
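AllTrue reports whether every lane of a mask is set; together with operator== (also listed here) it gives a whole-vector equality test. A minimal sketch under the same static-dispatch assumption; the helper name is illustrative:
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: true iff every lane of a
// equals the matching lane of b.
template <class D, class V = hn::VFromD<D>>
bool AllLanesEqual(D d, V a, V b) {
  return hn::AllTrue(d, a == b);
}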
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition arm_neon-inl.h:6349
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6584
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
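FirstN builds a mask covering the first num lanes, which combines naturally with IfThenElseZero (listed below) for partial-vector handling. A minimal sketch, assuming static dispatch via hwy/highway.h; the helper name is illustrative:
#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: keeps the first `num` lanes
// of v and zeroes the rest.
template <class D, class V = hn::VFromD<D>>
V KeepFirst(D d, V v, size_t num) {
  return hn::IfThenElseZero(hn::FirstN(d, num), v);
}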
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
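MulAdd computes mul * x + add per lane, fused on targets with FMA. A minimal sketch; the helper name is illustrative:
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: y = a * x + b per lane.
// NegMulAdd and MulSub (also listed here) cover the other sign combinations.
template <class V>
V Axpb(V a, V x, V b) {
  return hn::MulAdd(a, x, b);
}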
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4453
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
HWY_API Vec128< T, N > ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
typename D::Twice Twice
Definition ops/shared-inl.h:231
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
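ReorderWidenMulAccumulate multiplies bfloat16 inputs, widens to float and adds into two accumulators whose lane order is unspecified until RearrangeToOddPlusEven (listed below) combines them. A minimal sketch of one accumulation step, assuming static dispatch; names are illustrative:
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: one step of a bf16 dot
// product. a and b are bf16 vectors re-partitioned from the f32 tag df32;
// after the loop, the caller combines the accumulators with
// RearrangeToOddPlusEven(sum0, sum1) and reduces across lanes.
template <class DF32, class VBF, class VF = hn::VFromD<DF32>>
void DotStep(DF32 df32, VBF a, VBF b, VF& sum0, VF& sum1) {
  sum0 = hn::ReorderWidenMulAccumulate(df32, a, b, sum0, sum1);
}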
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
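Load and Store expect aligned pointers; LoadU and StoreU are the unaligned variants. A minimal strip-mined loop, assuming static dispatch, aligned buffers and a count that is a multiple of Lanes(d); names are illustrative:
#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: out[i] = 2 * in[i].
void DoubleAll(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
               size_t count) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, in + i);
    hn::Store(v + v, d, out + i);  // operator+ is provided for vectors
  }
}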
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
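IfThenElse selects yes or no per lane according to a mask; combined with a comparison such as operator> (listed above) it expresses branch-free selection. A minimal sketch; the helper name is illustrative:
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: per-lane max(x, 0),
// i.e. keep x where x > 0, otherwise 0.
template <class D, class V = hn::VFromD<D>>
V ClampToZero(D d, V x) {
  return hn::IfThenElse(x > hn::Zero(d), x, hn::Zero(d));
}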
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
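Iota(d, first) returns {first, first+1, ...} across the lanes, which is handy for generating index ramps. A minimal sketch, assuming static dispatch and leaving any remainder lanes to the caller; names are illustrative:
#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: out[i] = start + i for the
// largest multiple of Lanes(d) not exceeding count; the tail is left to the
// caller (e.g. a scalar loop).
void FillRamp(float* HWY_RESTRICT out, size_t count, float start) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= count; i += N) {
    hn::StoreU(hn::Iota(d, start + static_cast<float>(i)), d, out + i);
  }
}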
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
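BitCast reinterprets the lanes of a vector as another type of the same total size without converting values. A minimal sketch using RebindToUnsigned (listed above) to view float lanes as their bit patterns; names are illustrative:
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: returns the raw unsigned bit
// pattern of each float lane (no value conversion).
template <class DF, class VF = hn::VFromD<DF>>
hn::VFromD<hn::RebindToUnsigned<DF>> FloatBits(DF /*df*/, VF v) {
  const hn::RebindToUnsigned<DF> du;  // same lane count, unsigned lanes
  return hn::BitCast(du, v);
}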
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition arm_neon-inl.h:6387
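LoadInterleaved3 de-interleaves groups of three elements (for example packed RGB) into three planar vectors in one call. A minimal sketch, assuming static dispatch and a pixel count that is a multiple of Lanes(d); names are illustrative and only the red plane is stored:
#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: extracts the R plane from
// packed RGB bytes.
void ExtractRed(const uint8_t* HWY_RESTRICT rgb, size_t num_pixels,
                uint8_t* HWY_RESTRICT r_out) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    hn::VFromD<decltype(d)> r, g, b;
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);
    hn::StoreU(r, d, r_out + i);  // g and b would be stored the same way
  }
}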
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
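PromoteTo widens each lane to the larger type selected by the destination tag; DemoteTo (listed above) is the narrowing counterpart. A minimal sketch using the Vec64/Full128 aliases from this file; the helper name is illustrative:
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: widens 8 uint8_t lanes to
// 8 uint16_t lanes, matching the overload listed above.
hn::Vec128<uint16_t> WidenBytes(hn::Vec64<uint8_t> v) {
  return hn::PromoteTo(hn::Full128<uint16_t>(), v);
}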
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
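CompressStore packs the lanes selected by the mask to the front and stores them, returning how many were written; a full vector may be written, with lanes past the count unspecified (CompressBlendedStore, listed above, avoids writing them). A minimal sketch; the helper name is illustrative:
#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;
// Illustrative helper, not part of the library: appends the positive lanes of
// v to `out` (which must have room for a full vector) and returns the count.
template <class D, class V = hn::VFromD<D>>
size_t KeepPositive(D d, V v, hn::TFromD<D>* HWY_RESTRICT out) {
  return hn::CompressStore(v, v > hn::Zero(d), d, out);
}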
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition arm_neon-inl.h:6428
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6517
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6549
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:806
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:924
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:607
double float64_t
Definition base.h:303
typename EnableIfT< Condition >::type EnableIf
Definition base.h:383
float float32_t
Definition base.h:302
HWY_API size_t PopCount(uint64_t x)
Definition base.h:865
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:593
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:595
#define HWY_ALIGN
Definition set_macros-inl.h:83
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
Definition arm_neon-inl.h:5729
Definition arm_neon-inl.h:3968
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:3969
Definition ops/shared-inl.h:52
uint16x4_t type
Definition arm_neon-inl.h:706
uint16x8_t type
Definition arm_neon-inl.h:643
uint16x4_t type
Definition arm_neon-inl.h:701
uint16x8_t type
Definition arm_neon-inl.h:638
float32x2_t type
Definition arm_neon-inl.h:711
float32x4_t type
Definition arm_neon-inl.h:648
int16x4_t type
Definition arm_neon-inl.h:686
int16x8_t type
Definition arm_neon-inl.h:623
int32x2_t type
Definition arm_neon-inl.h:691
int32x4_t type
Definition arm_neon-inl.h:628
int64x1_t type
Definition arm_neon-inl.h:696
int64x2_t type
Definition arm_neon-inl.h:633
int8x16_t type
Definition arm_neon-inl.h:618
int8x8_t type
Definition arm_neon-inl.h:681
uint16x4_t type
Definition arm_neon-inl.h:666
uint16x8_t type
Definition arm_neon-inl.h:603
uint32x2_t type
Definition arm_neon-inl.h:671
uint32x4_t type
Definition arm_neon-inl.h:608
uint64x1_t type
Definition arm_neon-inl.h:676
uint64x2_t type
Definition arm_neon-inl.h:613
uint8x16_t type
Definition arm_neon-inl.h:598
uint8x8_t type
Definition arm_neon-inl.h:661
Definition x86_128-inl.h:55
__v128_u type
Definition wasm_128-inl.h:61
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:3639
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition arm_neon-inl.h:3646
Definition arm_neon-inl.h:3617
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:3627
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition arm_neon-inl.h:3620
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:3669
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition arm_neon-inl.h:3676
Definition arm_neon-inl.h:3652
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition arm_neon-inl.h:3654
uint16x8x2_t raw
Definition arm_neon-inl.h:364
uint16x4x2_t raw
Definition arm_neon-inl.h:368
uint16x8x2_t raw
Definition arm_neon-inl.h:356
uint16x4x2_t raw
Definition arm_neon-inl.h:360
float32x4x2_t raw
Definition arm_neon-inl.h:373
float32x2x2_t raw
Definition arm_neon-inl.h:377
int16x8x2_t raw
Definition arm_neon-inl.h:315
int16x4x2_t raw
Definition arm_neon-inl.h:319
int32x4x2_t raw
Definition arm_neon-inl.h:331
int32x2x2_t raw
Definition arm_neon-inl.h:335
int64x2x2_t raw
Definition arm_neon-inl.h:347
int64x1x2_t raw
Definition arm_neon-inl.h:351
int8x16x2_t raw
Definition arm_neon-inl.h:299
int8x8x2_t raw
Definition arm_neon-inl.h:303
uint16x8x2_t raw
Definition arm_neon-inl.h:307
uint16x4x2_t raw
Definition arm_neon-inl.h:311
uint32x4x2_t raw
Definition arm_neon-inl.h:323
uint32x2x2_t raw
Definition arm_neon-inl.h:327
uint64x2x2_t raw
Definition arm_neon-inl.h:339
uint64x1x2_t raw
Definition arm_neon-inl.h:343
uint8x16x2_t raw
Definition arm_neon-inl.h:291
uint8x8x2_t raw
Definition arm_neon-inl.h:295
Definition arm_neon-inl.h:283
uint16x8x3_t raw
Definition arm_neon-inl.h:465
uint16x4x3_t raw
Definition arm_neon-inl.h:469
uint16x8x3_t raw
Definition arm_neon-inl.h:457
uint16x4x3_t raw
Definition arm_neon-inl.h:461
float32x4x3_t raw
Definition arm_neon-inl.h:474
float32x2x3_t raw
Definition arm_neon-inl.h:478
int16x8x3_t raw
Definition arm_neon-inl.h:416
int16x4x3_t raw
Definition arm_neon-inl.h:420
int32x4x3_t raw
Definition arm_neon-inl.h:432
int32x2x3_t raw
Definition arm_neon-inl.h:436
int64x2x3_t raw
Definition arm_neon-inl.h:448
int64x1x3_t raw
Definition arm_neon-inl.h:452
int8x16x3_t raw
Definition arm_neon-inl.h:400
int8x8x3_t raw
Definition arm_neon-inl.h:404
uint16x8x3_t raw
Definition arm_neon-inl.h:408
uint16x4x3_t raw
Definition arm_neon-inl.h:412
uint32x4x3_t raw
Definition arm_neon-inl.h:424
uint32x2x3_t raw
Definition arm_neon-inl.h:428
uint64x2x3_t raw
Definition arm_neon-inl.h:440
uint64x1x3_t raw
Definition arm_neon-inl.h:444
uint8x16x3_t raw
Definition arm_neon-inl.h:392
uint8x8x3_t raw
Definition arm_neon-inl.h:396
Definition arm_neon-inl.h:285
uint16x8x4_t raw
Definition arm_neon-inl.h:566
uint16x4x4_t raw
Definition arm_neon-inl.h:570
uint16x8x4_t raw
Definition arm_neon-inl.h:558
uint16x4x4_t raw
Definition arm_neon-inl.h:562
float32x4x4_t raw
Definition arm_neon-inl.h:575
float32x2x4_t raw
Definition arm_neon-inl.h:579
int16x8x4_t raw
Definition arm_neon-inl.h:517
int16x4x4_t raw
Definition arm_neon-inl.h:521
int32x4x4_t raw
Definition arm_neon-inl.h:533
int32x2x4_t raw
Definition arm_neon-inl.h:537
int64x2x4_t raw
Definition arm_neon-inl.h:549
int64x1x4_t raw
Definition arm_neon-inl.h:553
int8x16x4_t raw
Definition arm_neon-inl.h:501
int8x8x4_t raw
Definition arm_neon-inl.h:505
uint16x8x4_t raw
Definition arm_neon-inl.h:509
uint16x4x4_t raw
Definition arm_neon-inl.h:513
uint32x4x4_t raw
Definition arm_neon-inl.h:525
uint32x2x4_t raw
Definition arm_neon-inl.h:529
uint64x2x4_t raw
Definition arm_neon-inl.h:541
uint64x1x4_t raw
Definition arm_neon-inl.h:545
uint8x16x4_t raw
Definition arm_neon-inl.h:493
uint8x8x4_t raw
Definition arm_neon-inl.h:497
Definition arm_neon-inl.h:287
Definition base.h:435
Definition base.h:296
Definition base.h:291