wasm_128-inl.h
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET == HWY_WASM_EMU256
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif

namespace detail {

template <typename T>
struct Raw128 {
  using type = __v128_u;
};
template <>
struct Raw128<float> {
  using type = __f32x4;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

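// Example (illustrative; AddOne is a hypothetical helper, not part of this
// file): DFromV/TFromV recover the descriptor and lane type in generic code:
//   template <class V>
//   V AddOne(V v) {
//     const DFromV<V> d;  // Simd<TFromV<V>, N, 0>; usable for Set/Zero etc.
//     return v + Set(d, TFromV<V>{1});
//   }
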
// ------------------------------ BitCast

namespace detail {

HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
  return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
  return static_cast<__v128_u>(v);
}

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
                                        Vec128<uint8_t, N * sizeof(T)> v) {
  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}

}  // namespace detail

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

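// Example (illustrative): reinterpret float lanes as their IEEE-754 bit
// patterns without changing any bits:
//   const Simd<float, 4, 0> df;
//   const Simd<uint32_t, 4, 0> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // all lanes 0x3F800000
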
// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{wasm_i32x4_splat(0)};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
  return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
                                const uint16_t t) {
  return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
                                const uint32_t t) {
  return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
                                const uint64_t t) {
  return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
  return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
  return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
  return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
}

template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
  return Vec128<float, N>{wasm_f32x4_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
  HWY_ALIGN T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] =
        AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
  }
  return Load(d, lanes);
}

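// Example (illustrative): Iota(Simd<int32_t, 4, 0>(), 5) returns {5, 6, 7, 8}.
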
// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
}

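// Example (illustrative): saturation clamps instead of wrapping. With uint8
// lanes, SaturatedAdd(Set(d, 200), Set(d, 100)) yields 255 in every lane,
// whereas operator+ would wrap around to 44.
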
// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
}

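// Example (illustrative): AverageRound(Set(d, 5), Set(d, 6)) yields 6 in
// every lane because (5 + 6 + 1) / 2 rounds up.
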
// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
}

// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
}

// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
}

// 8-bit
template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

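// Example (illustrative): for int8_t lanes holding -128 (0x80),
// ShiftRight<2> first shifts unsigned (0x20), then restores the sign via
// (0x20 ^ 0x20) - 0x20 = -32, matching an arithmetic shift.
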
// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}

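// Example (illustrative): RotateRight<8>(Set(d32, 0x12345678u)) yields
// 0x78123456 in every uint32_t lane.
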
// ------------------------------ Shift lanes by same variable #bits

// After https://reviews.llvm.org/D108415 shift argument became unsigned.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")

// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
                                          const int bits) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
                                           const int bits) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
                                          const int bits) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
                                           const int bits) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
                                          const int bits) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
                                           const int bits) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
                                         const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
                                          const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
                                         const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
                                          const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
                                         const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
                                          const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
}

// 8-bit
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{
      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}

template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
                                          const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ignore Wsign-conversion
HWY_DIAGNOSTICS(pop)

// ------------------------------ Minimum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(min)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t min[2];  // only two lanes, matching Max below
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(min)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to a < b ? a : b (taking into account our swapped arg order,
  // so that Min(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)};
}

// ------------------------------ Maximum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(max)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(max)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to b < a ? a : b (taking into account our swapped arg order,
  // so that Max(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmax(b.raw, a.raw)};
}

// ------------------------------ Integer multiplication

// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw);
  const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw);
  const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}

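// Example (illustrative): with int16 lanes, MulHigh(Set(d, 0x4000),
// Set(d, 16)) yields 4 in every lane: 0x4000 * 16 = 0x40000, whose upper
// 16 bits are 4.
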
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)};
}

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}

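// Example (illustrative): for a = {1, 2, 3, 4} and b = {10, 20, 30, 40}
// (int32 lanes), MulEven returns the 64-bit products of the even lanes:
// {1 * 10, 3 * 30}. Masking the odd lanes to zero makes the 64-bit multiply
// see only the (zero-extended) even-lane values.
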
// ------------------------------ Negate

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
}

// ------------------------------ Floating-point mul / div

template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}

// Approximate reciprocal
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
  return one / v;
}

// Absolute value of difference.
template <size_t N>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  return Abs(a - b);
}
882
// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return mul * x + add;
}

// Returns add - mul * x
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return add - mul * x;
}

// Returns mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> sub) {
  return mul * x - sub;
}

// Returns -mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Full precision square root
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}

// Approximate reciprocal square root
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
  return one / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}

// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}

// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}

// ------------------------------ Floating-point classification
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

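// Example (illustrative): for f32, Add(vi, vi) shifts out the sign bit, so
// +/-inf (0x7F800000 / 0xFF800000) both become 0xFF000000, which equals
// MaxExponentTimes2; any finite value compares not-equal.
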
// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
                                   Mask128<TFrom, N> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask128<TTo, N>{m.raw};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
                                     const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
                                      const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
                                      const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
                                      const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
                                       const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
                                       const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> d;
  const Repartition<uint32_t, decltype(d)> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves are not equal, this is the answer.
  const auto m_gt = a32 > b32;

  // Otherwise, the lower half decides.
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
  const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));

  const auto gt = Or(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
}

template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
                                    const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>(b, a);
}

// ------------------------------ Weak inequality

// Float <= >=
template <size_t N>
HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
}

// ------------------------------ FirstN (Iota, Lt)

template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  return Vec128<T, N>{wasm_v128_not(v.raw)};
}

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Xor3

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(DFromV<decltype(magn)>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
}

// ------------------------------ BroadcastSignBit (compare)

template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const auto zero = Zero(d);
  return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)

// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

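// Example (illustrative): for 16-bit lanes and a shift count of 5 (binary
// 0101), the bit tests above select the ShiftLeft<4> and ShiftLeft<1> steps,
// so the result is v << 5 without any per-lane extract_lane.
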
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  alignas(16) T bits_lanes[2];
  Store(v, d, lanes);
  Store(bits, d, bits_lanes);
  lanes[0] <<= bits_lanes[0];
  lanes[1] <<= bits_lanes[1];
  return Load(d, lanes);
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// ================================================== MEMORY

// ------------------------------ Load

template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{wasm_v128_load(aligned)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// Partial load.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(p, &v);
  return v;
}

// LoadU == Load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ Store

template <typename T>
HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// Partial store.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  CopyBytes<sizeof(T) * N>(&v, p);
}

HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
                   float* HWY_RESTRICT p) {
  *p = wasm_f32x4_extract_lane(v.raw, 0);
}

// StoreU == Store.
template <typename T, size_t N>
HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T, size_t N>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
                          T* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                    T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// ------------------------------ Scatter (Store)

template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
                           T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
                          const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

// ------------------------------ Gather (Load/Store)

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
                                  const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  alignas(16) T lanes[N];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  alignas(16) T lanes[N];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}

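// Example (illustrative): gather four int32 values by lane index:
//   const Simd<int32_t, 4, 0> d;
//   alignas(16) const int32_t table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
//   const auto idx = Iota(d, 2);                // {2, 3, 4, 5}
//   const auto v = GatherIndex(d, table, idx);  // {20, 30, 40, 50}
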
// ================================================== SWIZZLE

// ------------------------------ ExtractLane

namespace detail {

template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i16x8_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
}

template <size_t kLane, size_t N>
HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
  return wasm_f32x4_extract_lane(v.raw, kLane);
}

}  // namespace detail

// One overload per vector length just in case *_extract_lane raise compile
// errors if their argument is out of bounds (even if that would never be
// reached at runtime).
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return GetLane(v);
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
      case 8:
        return detail::ExtractLane<8>(v);
      case 9:
        return detail::ExtractLane<9>(v);
      case 10:
        return detail::ExtractLane<10>(v);
      case 11:
        return detail::ExtractLane<11>(v);
      case 12:
        return detail::ExtractLane<12>(v);
      case 13:
        return detail::ExtractLane<13>(v);
      case 14:
        return detail::ExtractLane<14>(v);
      case 15:
        return detail::ExtractLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// ------------------------------ GetLane
template <typename T, size_t N>
HWY_API T GetLane(const Vec128<T, N> v) {
  return detail::ExtractLane<0>(v);
}

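// Example (illustrative): with a compile-time-constant index, ExtractLane
// typically compiles to a single extract_lane instruction; with a runtime
// index it goes through a 16-byte stack buffer. GetLane(v) is shorthand for
// lane 0.
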
// ------------------------------ InsertLane

namespace detail {

template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
}

template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i16x8_replace_lane(v.raw, kLane, static_cast<int16_t>(t))};
}

template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
}

template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
}

template <size_t kLane, size_t N>
HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)};
}

template <size_t kLane, size_t N>
HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) {
  static_assert(kLane < 2, "Lane index out of bounds");
  return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)};
}

}  // namespace detail

1885// Requires one overload per vector length because InsertLane<3> may be a
1886// compile error if it calls wasm_f64x2_replace_lane.
1887
1888template <typename T>
1889HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
1890 HWY_DASSERT(i == 0);
1891 (void)i;
1892 return Set(DFromV<decltype(v)>(), t);
1893}
1894
1895template <typename T>
1896HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
1897#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1898 if (__builtin_constant_p(i)) {
1899 switch (i) {
1900 case 0:
1901 return detail::InsertLane<0>(v, t);
1902 case 1:
1903 return detail::InsertLane<1>(v, t);
1904 }
1905 }
1906#endif
1907 const DFromV<decltype(v)> d;
1908 alignas(16) T lanes[2];
1909 Store(v, d, lanes);
1910 lanes[i] = t;
1911 return Load(d, lanes);
1912}
1913
1914template <typename T>
1915HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
1916#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1917 if (__builtin_constant_p(i)) {
1918 switch (i) {
1919 case 0:
1920 return detail::InsertLane<0>(v, t);
1921 case 1:
1922 return detail::InsertLane<1>(v, t);
1923 case 2:
1924 return detail::InsertLane<2>(v, t);
1925 case 3:
1926 return detail::InsertLane<3>(v, t);
1927 }
1928 }
1929#endif
1930 const DFromV<decltype(v)> d;
1931 alignas(16) T lanes[4];
1932 Store(v, d, lanes);
1933 lanes[i] = t;
1934 return Load(d, lanes);
1935}
1936
1937template <typename T>
1938HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
1939#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1940 if (__builtin_constant_p(i)) {
1941 switch (i) {
1942 case 0:
1943 return detail::InsertLane<0>(v, t);
1944 case 1:
1945 return detail::InsertLane<1>(v, t);
1946 case 2:
1947 return detail::InsertLane<2>(v, t);
1948 case 3:
1949 return detail::InsertLane<3>(v, t);
1950 case 4:
1951 return detail::InsertLane<4>(v, t);
1952 case 5:
1953 return detail::InsertLane<5>(v, t);
1954 case 6:
1955 return detail::InsertLane<6>(v, t);
1956 case 7:
1957 return detail::InsertLane<7>(v, t);
1958 }
1959 }
1960#endif
1961 const DFromV<decltype(v)> d;
1962 alignas(16) T lanes[8];
1963 Store(v, d, lanes);
1964 lanes[i] = t;
1965 return Load(d, lanes);
1966}
1967
1968template <typename T>
1969HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
1970#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1971 if (__builtin_constant_p(i)) {
1972 switch (i) {
1973 case 0:
1974 return detail::InsertLane<0>(v, t);
1975 case 1:
1976 return detail::InsertLane<1>(v, t);
1977 case 2:
1978 return detail::InsertLane<2>(v, t);
1979 case 3:
1980 return detail::InsertLane<3>(v, t);
1981 case 4:
1982 return detail::InsertLane<4>(v, t);
1983 case 5:
1984 return detail::InsertLane<5>(v, t);
1985 case 6:
1986 return detail::InsertLane<6>(v, t);
1987 case 7:
1988 return detail::InsertLane<7>(v, t);
1989 case 8:
1990 return detail::InsertLane<8>(v, t);
1991 case 9:
1992 return detail::InsertLane<9>(v, t);
1993 case 10:
1994 return detail::InsertLane<10>(v, t);
1995 case 11:
1996 return detail::InsertLane<11>(v, t);
1997 case 12:
1998 return detail::InsertLane<12>(v, t);
1999 case 13:
2000 return detail::InsertLane<13>(v, t);
2001 case 14:
2002 return detail::InsertLane<14>(v, t);
2003 case 15:
2004 return detail::InsertLane<15>(v, t);
2005 }
2006 }
2007#endif
2008 const DFromV<decltype(v)> d;
2009 alignas(16) T lanes[16];
2010 Store(v, d, lanes);
2011 lanes[i] = t;
2012 return Load(d, lanes);
2013}
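// Usage sketch (illustrative only): with a compile-time-constant index,
// GCC/Clang resolve the switch above to a single replace_lane; otherwise
// the vector takes the Store/patch/Load fallback:
//   const Full128<uint8_t> d;
//   Vec128<uint8_t> v = Zero(d);
//   for (size_t i = 0; i < Lanes(d); ++i) {
//     v = InsertLane(v, i, static_cast<uint8_t>(i));  // runtime index
//   }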
2014
2015// ------------------------------ LowerHalf
2016
2017template <typename T, size_t N>
2018HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
2019 Vec128<T, N> v) {
2020 return Vec128<T, N / 2>{v.raw};
2021}
2022
2023template <typename T, size_t N>
2024HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
2025 return LowerHalf(Simd<T, N / 2, 0>(), v);
2026}
2027
2028// ------------------------------ ShiftLeftBytes
2029
2030// 0x01..0F, kBytes = 1 => 0x02..0F00
2031template <int kBytes, typename T, size_t N>
2032HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
2033 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2034 const __i8x16 zero = wasm_i8x16_splat(0);
2035 switch (kBytes) {
2036 case 0:
2037 return v;
2038
2039 case 1:
2040 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
2041 6, 7, 8, 9, 10, 11, 12, 13, 14)};
2042
2043 case 2:
2044 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
2045 5, 6, 7, 8, 9, 10, 11, 12, 13)};
2046
2047 case 3:
2048 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
2049 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
2050
2051 case 4:
2052 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
2053 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
2054
2055 case 5:
2056 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
2057 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
2058
2059 case 6:
2060 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2061 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
2062
2063 case 7:
2064 return Vec128<T, N>{wasm_i8x16_shuffle(
2065 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
2066
2067 case 8:
2068 return Vec128<T, N>{wasm_i8x16_shuffle(
2069 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
2070
2071 case 9:
2072 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2073 16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
2074 6)};
2075
2076 case 10:
2077 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2078 16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
2079 5)};
2080
2081 case 11:
2082 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2083 16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
2084 4)};
2085
2086 case 12:
2087 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2088 16, 16, 16, 16, 16, 16, 16, 0, 1,
2089 2, 3)};
2090
2091 case 13:
2092 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2093 16, 16, 16, 16, 16, 16, 16, 16, 0,
2094 1, 2)};
2095
2096 case 14:
2097 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2098 16, 16, 16, 16, 16, 16, 16, 16, 16,
2099 0, 1)};
2100
2101 case 15:
2102 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
2103 16, 16, 16, 16, 16, 16, 16, 16, 16,
2104 16, 0)};
2105 }
2106 return Vec128<T, N>{zero};
2107}
2108
2109template <int kBytes, typename T, size_t N>
2110HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2111 return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
2112}
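// Usage sketch (illustrative only): ShiftLeftBytes moves bytes toward
// higher lane indices and shifts in zeros (shuffle index 16 selects from
// the zero vector above):
//   const Full128<uint8_t> d;
//   const auto v = Iota(d, 1);               // {1, 2, .., 16}
//   const auto s = ShiftLeftBytes<2>(d, v);  // {0, 0, 1, 2, .., 14}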
2113
2114// ------------------------------ ShiftLeftLanes
2115
2116template <int kLanes, typename T, size_t N>
2117HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2118 const Repartition<uint8_t, decltype(d)> d8;
2119 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2120}
2121
2122template <int kLanes, typename T, size_t N>
2123HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
2124 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2125}
2126
2127// ------------------------------ ShiftRightBytes
2128namespace detail {
2129
2130// Helper function allows zeroing invalid lanes in caller.
2131template <int kBytes, typename T, size_t N>
2132HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
2133 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2134 const __i8x16 zero = wasm_i8x16_splat(0);
2135
2136 switch (kBytes) {
2137 case 0:
2138 return v.raw;
2139
2140 case 1:
2141 return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2142 12, 13, 14, 15, 16);
2143
2144 case 2:
2145 return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2146 13, 14, 15, 16, 16);
2147
2148 case 3:
2149 return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2150 13, 14, 15, 16, 16, 16);
2151
2152 case 4:
2153 return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2154 14, 15, 16, 16, 16, 16);
2155
2156 case 5:
2157 return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2158 15, 16, 16, 16, 16, 16);
2159
2160 case 6:
2161 return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2162 16, 16, 16, 16, 16, 16);
2163
2164 case 7:
2165 return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2166 16, 16, 16, 16, 16, 16, 16);
2167
2168 case 8:
2169 return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2170 16, 16, 16, 16, 16, 16, 16);
2171
2172 case 9:
2173 return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
2174 16, 16, 16, 16, 16, 16, 16);
2175
2176 case 10:
2177 return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
2178 16, 16, 16, 16, 16, 16, 16);
2179
2180 case 11:
2181 return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
2182 16, 16, 16, 16, 16, 16, 16);
2183
2184 case 12:
2185 return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
2186 16, 16, 16, 16, 16, 16, 16);
2187
2188 case 13:
2189 return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
2190 16, 16, 16, 16, 16, 16, 16);
2191
2192 case 14:
2193 return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
2194 16, 16, 16, 16, 16, 16, 16);
2195
2196 case 15:
2197 return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
2198 16, 16, 16, 16, 16, 16, 16);
2199 case 16:
2200 return zero;
2201 }
2202}
2203
2204} // namespace detail
2205
2206// 0x01..0F, kBytes = 1 => 0x0001..0E
2207template <int kBytes, typename T, size_t N>
2208HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
2209 // For partial vectors, clear upper lanes so we shift in zeros.
2210 if (N != 16 / sizeof(T)) {
2211 const Vec128<T> vfull{v.raw};
2212 v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
2213 }
2214 return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
2215}
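// Usage sketch (illustrative only): for partial vectors the FirstN mask
// above zeroes the undefined upper lanes, so zeros are shifted in:
//   const Simd<uint8_t, 4, 0> d;              // quarter vector
//   const auto v = Iota(d, 1);                // {1, 2, 3, 4}
//   const auto s = ShiftRightBytes<1>(d, v);  // {2, 3, 4, 0}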
2216
2217// ------------------------------ ShiftRightLanes
2218template <int kLanes, typename T, size_t N>
2219HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2220 const Repartition<uint8_t, decltype(d)> d8;
2221 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2222}
2223
2224// ------------------------------ UpperHalf (ShiftRightBytes)
2225
2226// Full input: copy hi into lo (smaller instruction encoding than shifts).
2227template <typename T>
2228HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, const Vec128<T> v) {
2229 return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
2230}
2231HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
2232 return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
2233}
2234
2235// Partial
2236template <typename T, size_t N, HWY_IF_LE64(T, N)>
2237HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
2238 Vec128<T, N> v) {
2239 const DFromV<decltype(v)> d;
2240 const RebindToUnsigned<decltype(d)> du;
2241 const auto vu = BitCast(du, v);
2242 const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
2243 return Vec128<T, (N + 1) / 2>{upper.raw};
2244}
2245
2246// ------------------------------ CombineShiftRightBytes
2247
2248template <int kBytes, typename T, class V = Vec128<T>>
2249HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
2250 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2251 switch (kBytes) {
2252 case 0:
2253 return lo;
2254
2255 case 1:
2256 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2257 11, 12, 13, 14, 15, 16)};
2258
2259 case 2:
2260 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2261 11, 12, 13, 14, 15, 16, 17)};
2262
2263 case 3:
2264 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2265 12, 13, 14, 15, 16, 17, 18)};
2266
2267 case 4:
2268 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2269 13, 14, 15, 16, 17, 18, 19)};
2270
2271 case 5:
2272 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2273 14, 15, 16, 17, 18, 19, 20)};
2274
2275 case 6:
2276 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
2277 14, 15, 16, 17, 18, 19, 20, 21)};
2278
2279 case 7:
2280 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
2281 15, 16, 17, 18, 19, 20, 21, 22)};
2282
2283 case 8:
2284 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
2285 16, 17, 18, 19, 20, 21, 22, 23)};
2286
2287 case 9:
2288 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
2289 17, 18, 19, 20, 21, 22, 23, 24)};
2290
2291 case 10:
2292 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
2293 17, 18, 19, 20, 21, 22, 23, 24, 25)};
2294
2295 case 11:
2296 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
2297 18, 19, 20, 21, 22, 23, 24, 25, 26)};
2298
2299 case 12:
2300 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
2301 19, 20, 21, 22, 23, 24, 25, 26, 27)};
2302
2303 case 13:
2304 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
2305 20, 21, 22, 23, 24, 25, 26, 27, 28)};
2306
2307 case 14:
2308 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
2309 21, 22, 23, 24, 25, 26, 27, 28, 29)};
2310
2311 case 15:
2312 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2313 22, 23, 24, 25, 26, 27, 28, 29, 30)};
2314 }
2315 return hi;
2316}
2317
2318template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
2319 class V = Vec128<T, N>>
2320HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
2321 constexpr size_t kSize = N * sizeof(T);
2322 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
2323 const Repartition<uint8_t, decltype(d)> d8;
2324 const Full128<uint8_t> d_full8;
2325 using V8 = VFromD<decltype(d_full8)>;
2326 const V8 hi8{BitCast(d8, hi).raw};
2327 // Move into most-significant bytes
2328 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
2329 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
2330 return V{BitCast(Full128<T>(), r).raw};
2331}
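// Usage sketch (illustrative only): CombineShiftRightBytes<k> yields the
// low 16 bytes of the 32-byte concatenation hi:lo shifted right k bytes:
//   const Full128<uint8_t> d;
//   const auto lo = Iota(d, 0);   // {0, .., 15}
//   const auto hi = Iota(d, 16);  // {16, .., 31}
//   const auto r = CombineShiftRightBytes<4>(d, hi, lo);  // {4, .., 19}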
2332
2333// ------------------------------ Broadcast/splat any lane
2334
2335template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2336HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2337 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2338 return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
2339 kLane, kLane, kLane, kLane, kLane)};
2340}
2341
2342template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2343HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2344 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2345 return Vec128<T, N>{
2346 wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
2347}
2348
2349template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2350HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2351 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2352 return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
2353}
2354
2355// ------------------------------ TableLookupBytes
2356
2357// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
2358// lane indices in [0, 16).
2359template <typename T, size_t N, typename TI, size_t NI>
2360HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
2361 const Vec128<TI, NI> from) {
2362// Not yet available in all engines, see
2363// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
2364// V8 implementation of this had a bug, fixed on 2021-04-03:
2365// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
2366#if 0
2367 return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
2368#else
2369 alignas(16) uint8_t control[16];
2370 alignas(16) uint8_t input[16];
2371 alignas(16) uint8_t output[16];
2372 wasm_v128_store(control, from.raw);
2373 wasm_v128_store(input, bytes.raw);
2374 for (size_t i = 0; i < 16; ++i) {
2375 output[i] = control[i] < 16 ? input[control[i]] : 0;
2376 }
2377 return Vec128<TI, NI>{wasm_v128_load(output)};
2378#endif
2379}
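// Usage sketch (illustrative only): reversing bytes via a lookup table:
//   const Full128<uint8_t> d;
//   alignas(16) static constexpr uint8_t kRev[16] = {
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
//   const auto bytes = Iota(d, 0);
//   const auto reversed = TableLookupBytes(bytes, Load(d, kRev));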
2380
2381template <typename T, size_t N, typename TI, size_t NI>
2382HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
2383 const Vec128<TI, NI> from) {
2384 const Simd<TI, NI, 0> d;
2385 // Mask size must match vector type, so cast everything to this type.
2386 Repartition<int8_t, decltype(d)> di8;
2387 Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
2388 const auto msb = BitCast(di8, from) < Zero(di8);
2389 const auto lookup =
2390 TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
2391 return BitCast(d, IfThenZeroElse(msb, lookup));
2392}
2393
2394// ------------------------------ Hard-coded shuffles
2395
2396// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
2397// Shuffle0321 rotates one lane to the right (the previous least-significant
2398// lane is now most-significant). These could also be implemented via
2399// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
2400
2401// Swap 32-bit halves in 64-bit halves.
2402template <typename T, size_t N>
2403HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
2404 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2405 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2406 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
2407}
2408
2409// These are used by generic_ops-inl to implement LoadInterleaved3.
2410namespace detail {
2411
2412template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2413HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
2414 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2415 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16,
2416 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2417 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2418}
2419template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2420HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
2421 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2422 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8,
2423 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2424}
2425template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2426HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
2427 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2428 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
2429}
2430
2431template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2432HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
2433 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2434 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16,
2435 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2436 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2437}
2438template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2439HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
2440 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2441 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8,
2442 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2443}
2444template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2445HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
2446 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2447 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
2448}
2449
2450template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2451HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
2452 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2453 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16,
2454 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2455 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2456}
2457template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2458HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
2459 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2460 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8,
2461 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2462}
2463template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2464HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
2465 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2466 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
2467}
2468
2469} // namespace detail
2470
2471// Swap 64-bit halves
2472template <typename T>
2473HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
2474 static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
2475 return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2476}
2477template <typename T>
2478HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
2479 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2480 return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2481}
2482
2483// Rotate right 32 bits
2484template <typename T>
2485HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
2486 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2487 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
2488}
2489
2490// Rotate left 32 bits
2491template <typename T>
2492HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
2493 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2494 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
2495}
2496
2497// Reverse
2498template <typename T>
2499HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
2500 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2501 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
2502}
2503
2504// ------------------------------ TableLookupLanes
2505
2506// Returned by SetTableIndices for use by TableLookupLanes.
2507template <typename T, size_t N = 16 / sizeof(T)>
2508struct Indices128 {
2509 __v128_u raw;
2510};
2511
2512template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2513HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
2514 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2515#if HWY_IS_DEBUG_BUILD
2516 const Rebind<TI, decltype(d)> di;
2517 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2518 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
2519#endif
2520
2521 const Repartition<uint8_t, decltype(d)> d8;
2522 using V8 = VFromD<decltype(d8)>;
2523 const Repartition<uint16_t, decltype(d)> d16;
2524
2525 // Broadcast each lane index to all bytes of T and shift to bytes
2526 static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
2527 if (sizeof(T) == 4) {
2528 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2529 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2530 const V8 lane_indices =
2531 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2532 const V8 byte_indices =
2533 BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
2534 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2535 0, 1, 2, 3, 0, 1, 2, 3};
2536 return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2537 } else {
2538 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2539 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2540 const V8 lane_indices =
2541 TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2542 const V8 byte_indices =
2543 BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
2544 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2545 0, 1, 2, 3, 4, 5, 6, 7};
2546 return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2547 }
2548}
2549
2550template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2551HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
2552 const Rebind<TI, decltype(d)> di;
2553 return IndicesFromVec(d, LoadU(di, idx));
2554}
2555
2556template <typename T, size_t N>
2557HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2558 using TI = MakeSigned<T>;
2559 const DFromV<decltype(v)> d;
2560 const Rebind<TI, decltype(d)> di;
2561 return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
2562}
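// Usage sketch (illustrative only): runtime lane permutation; indices must
// be in [0, N) as checked by the debug assert above:
//   const Full128<uint32_t> d;
//   alignas(16) static constexpr int32_t kIdx[4] = {3, 2, 1, 0};
//   const auto v = Iota(d, 0);  // {0, 1, 2, 3}
//   const auto r = TableLookupLanes(v, SetTableIndices(d, kIdx));  // reversed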
2563
2564// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
2565
2566// Single lane: no change
2567template <typename T>
2568HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
2569 return v;
2570}
2571
2572// Two lanes: shuffle
2573template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2574HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
2575 return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
2576}
2577
2578template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2579HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2580 return Shuffle01(v);
2581}
2582
2583// Four lanes: shuffle
2584template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2585HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2586 return Shuffle0123(v);
2587}
2588
2589// 16-bit
2590template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2591HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
2592 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2593 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
2594}
2595
2596// ------------------------------ Reverse2
2597
2598template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2599HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
2600 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2601 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2602}
2603
2604template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2605HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2606 return Shuffle2301(v);
2607}
2608
2609template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2610HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2611 return Shuffle01(v);
2612}
2613
2614// ------------------------------ Reverse4
2615
2616template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2617HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
2618 return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
2619 1, 0, 7, 6, 5, 4)});
2620}
2621
2622template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2623HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2624 return Shuffle0123(v);
2625}
2626
2627template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2628HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
2629 HWY_ASSERT(0); // don't have 4 u64 lanes
2630}
2631
2632// ------------------------------ Reverse8
2633
2634template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2635HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
2636 return Reverse(d, v);
2637}
2638
2639template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2640HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
2641 HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
2642}
2643
2644// ------------------------------ InterleaveLower
2645
2646template <size_t N>
2647HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
2648 Vec128<uint8_t, N> b) {
2649 return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
2650 a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2651}
2652template <size_t N>
2653HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
2654 Vec128<uint16_t, N> b) {
2655 return Vec128<uint16_t, N>{
2656 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2657}
2658template <size_t N>
2659HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
2660 Vec128<uint32_t, N> b) {
2661 return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2662}
2663template <size_t N>
2664HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
2665 Vec128<uint64_t, N> b) {
2666 return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2667}
2668
2669template <size_t N>
2670HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
2671 Vec128<int8_t, N> b) {
2672 return Vec128<int8_t, N>{wasm_i8x16_shuffle(
2673 a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2674}
2675template <size_t N>
2676HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
2677 Vec128<int16_t, N> b) {
2678 return Vec128<int16_t, N>{
2679 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2680}
2681template <size_t N>
2682HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
2683 Vec128<int32_t, N> b) {
2684 return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2685}
2686template <size_t N>
2687HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
2688 Vec128<int64_t, N> b) {
2689 return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2690}
2691
2692template <size_t N>
2693HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
2694 Vec128<float, N> b) {
2695 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2696}
2697
2698template <size_t N>
2699HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
2700 Vec128<double, N> b) {
2701 return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2702}
2703
2704// Additional overload for the optional tag.
2705template <class V>
2706HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2707 return InterleaveLower(a, b);
2708}
2709
2710// ------------------------------ InterleaveUpper (UpperHalf)
2711
2712// All functions inside detail lack the required D parameter.
2713namespace detail {
2714
2715template <size_t N>
2716HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
2717 Vec128<uint8_t, N> b) {
2718 return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2719 26, 11, 27, 12, 28, 13, 29, 14,
2720 30, 15, 31)};
2721}
2722template <size_t N>
2723HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
2724 Vec128<uint16_t, N> b) {
2725 return Vec128<uint16_t, N>{
2726 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2727}
2728template <size_t N>
2729HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
2730 Vec128<uint32_t, N> b) {
2731 return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2732}
2733template <size_t N>
2734HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
2735 Vec128<uint64_t, N> b) {
2736 return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2737}
2738
2739template <size_t N>
2740HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
2741 Vec128<int8_t, N> b) {
2742 return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2743 26, 11, 27, 12, 28, 13, 29, 14,
2744 30, 15, 31)};
2745}
2746template <size_t N>
2747HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
2748 Vec128<int16_t, N> b) {
2749 return Vec128<int16_t, N>{
2750 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2751}
2752template <size_t N>
2753HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
2754 Vec128<int32_t, N> b) {
2755 return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2756}
2757template <size_t N>
2758HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
2759 Vec128<int64_t, N> b) {
2760 return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2761}
2762
2763template <size_t N>
2764HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
2765 Vec128<float, N> b) {
2766 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2767}
2768
2769template <size_t N>
2770HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a,
2771 Vec128<double, N> b) {
2772 return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2773}
2774
2775} // namespace detail
2776
2777// Full
2778template <typename T, class V = Vec128<T>>
2779HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
2780 return detail::InterleaveUpper(a, b);
2781}
2782
2783// Partial
2784template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
2785HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
2786 const Half<decltype(d)> d2;
2787 return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
2788}
2789
2790// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2791
2792// Same as Interleave*, except that the return lanes are double-width integers;
2793// this is necessary because the single-lane scalar cannot return two values.
2794template <class V, class DW = RepartitionToWide<DFromV<V>>>
2795HWY_API VFromD<DW> ZipLower(V a, V b) {
2796 return BitCast(DW(), InterleaveLower(a, b));
2797}
2798template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2799HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2800 return BitCast(dw, InterleaveLower(D(), a, b));
2801}
2802
2803template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2804HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2805 return BitCast(dw, InterleaveUpper(D(), a, b));
2806}
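// Usage sketch (illustrative only): ZipLower interleaves a and b and
// reinterprets each pair as one double-width lane:
//   const Full128<uint8_t> d8;
//   const RepartitionToWide<decltype(d8)> d16;
//   const auto w = ZipLower(d16, Set(d8, 1), Set(d8, 2));
//   // every u16 lane of w is 0x0201 (b supplies the upper byte)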
2807
2808// ================================================== COMBINE
2809
2810// ------------------------------ Combine (InterleaveLower)
2811
2812// N = N/2 + N/2 (upper half undefined)
2813template <typename T, size_t N, HWY_IF_LE128(T, N)>
2814HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
2815 Vec128<T, N / 2> lo_half) {
2816 const Half<decltype(d)> d2;
2817 const RebindToUnsigned<decltype(d2)> du2;
2818 // Treat half-width input as one lane, and expand to two lanes.
2819 using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
2820 const VU lo{BitCast(du2, lo_half).raw};
2821 const VU hi{BitCast(du2, hi_half).raw};
2822 return BitCast(d, InterleaveLower(lo, hi));
2823}
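// Usage sketch (illustrative only): lo_half occupies the lower lanes of
// the result, hi_half the upper lanes:
//   const Full128<uint32_t> d;
//   const Half<decltype(d)> dh;
//   const auto v = Combine(d, Iota(dh, 2), Iota(dh, 0));  // {0, 1, 2, 3}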
2824
2825// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
2826
2827template <typename T, size_t N, HWY_IF_LE128(T, N)>
2828HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
2829 const Half<decltype(d)> dh;
2830 return IfThenElseZero(FirstN(d, Lanes(dh)), Vec128<T, N>{lo.raw});
2831}
2832// ------------------------------ ConcatLowerLower
2833
2834// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2835template <typename T>
2836HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
2837 const Vec128<T> lo) {
2838 return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
2839}
2840template <typename T, size_t N, HWY_IF_LE64(T, N)>
2841HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2842 const Vec128<T, N> lo) {
2843 const Half<decltype(d)> d2;
2844 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
2845}
2846
2847// ------------------------------ ConcatUpperUpper
2848
2849template <typename T>
2850HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
2851 const Vec128<T> lo) {
2852 return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
2853}
2854template <typename T, size_t N, HWY_IF_LE64(T, N)>
2855HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2856 const Vec128<T, N> lo) {
2857 const Half<decltype(d)> d2;
2858 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
2859}
2860
2861// ------------------------------ ConcatLowerUpper
2862
2863template <typename T>
2864HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
2865 const Vec128<T> lo) {
2866 return CombineShiftRightBytes<8>(d, hi, lo);
2867}
2868template <typename T, size_t N, HWY_IF_LE64(T, N)>
2869HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2870 const Vec128<T, N> lo) {
2871 const Half<decltype(d)> d2;
2872 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
2873}
2874
2875// ------------------------------ ConcatUpperLower
2876template <typename T, size_t N>
2877HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2878 const Vec128<T, N> lo) {
2879 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
2880}
2881
2882// ------------------------------ ConcatOdd
2883
2884// 8-bit full
2885template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2886HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2887 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
2888 17, 19, 21, 23, 25, 27, 29, 31)};
2889}
2890
2891// 8-bit x8
2892template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2893HWY_API Vec128<T, 8> ConcatOdd(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
2894 Vec128<T, 8> lo) {
2895 // Don't care about upper half.
2896 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
2897 23, 1, 3, 5, 7, 17, 19, 21, 23)};
2898}
2899
2900// 8-bit x4
2901template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2902HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2903 Vec128<T, 4> lo) {
2904 // Don't care about upper 3/4.
2905 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
2906 19, 1, 3, 17, 19, 1, 3, 17, 19)};
2907}
2908
2909// 16-bit full
2910template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2911HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2912 return Vec128<T>{
2913 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
2914}
2915
2916// 16-bit x4
2917template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2918HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2919 Vec128<T, 4> lo) {
2920 // Don't care about upper half.
2921 return Vec128<T, 4>{
2922 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
2923}
2924
2925// 32-bit full
2926template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2927HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2928 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2929}
2930
2931// Any T x2
2932template <typename T>
2933HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
2934 Vec128<T, 2> lo) {
2935 return InterleaveUpper(d, lo, hi);
2936}
2937
2938// ------------------------------ ConcatEven (InterleaveLower)
2939
2940// 8-bit full
2941template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2942HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2943 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
2944 16, 18, 20, 22, 24, 26, 28, 30)};
2945}
2946
2947// 8-bit x8
2948template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2949HWY_API Vec128<T, 8> ConcatEven(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
2950 Vec128<T, 8> lo) {
2951 // Don't care about upper half.
2952 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20,
2953 22, 0, 2, 4, 6, 16, 18, 20, 22)};
2954}
2955
2956// 8-bit x4
2957template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2958HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2959 Vec128<T, 4> lo) {
2960 // Don't care about upper 3/4.
2961 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
2962 18, 0, 2, 16, 18, 0, 2, 16, 18)};
2963}
2964
2965// 16-bit full
2966template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2967HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2968 return Vec128<T>{
2969 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
2970}
2971
2972// 16-bit x4
2973template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2974HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
2975 Vec128<T, 4> lo) {
2976 // Don't care about upper half.
2977 return Vec128<T, 4>{
2978 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
2979}
2980
2981// 32-bit full
2982template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2983HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2984 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2985}
2986
2987// Any T x2
2988template <typename T>
2989HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
2990 Vec128<T, 2> lo) {
2991 return InterleaveLower(d, lo, hi);
2992}
2993
2994// ------------------------------ DupEven (InterleaveLower)
2995
2996template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2997HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
2998 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
2999}
3000
3001template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3002HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
3003 return InterleaveLower(DFromV<decltype(v)>(), v, v);
3004}
3005
3006// ------------------------------ DupOdd (InterleaveUpper)
3007
3008template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3009HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3010 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
3011}
3012
3013template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3014HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
3015 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
3016}
3017
3018// ------------------------------ OddEven
3019
3020namespace detail {
3021
3022template <typename T, size_t N>
3023HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
3024 const Vec128<T, N> b) {
3025 const DFromV<decltype(a)> d;
3026 const Repartition<uint8_t, decltype(d)> d8;
3027 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3028 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3029 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3030}
3031template <typename T, size_t N>
3032HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
3033 const Vec128<T, N> b) {
3034 return Vec128<T, N>{
3035 wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
3036}
3037template <typename T, size_t N>
3038HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
3039 const Vec128<T, N> b) {
3040 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3041}
3042template <typename T, size_t N>
3043HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
3044 const Vec128<T, N> b) {
3045 return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
3046}
3047
3048} // namespace detail
3049
3050template <typename T, size_t N>
3051HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
3052 return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3053}
3054template <size_t N>
3055HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
3056 const Vec128<float, N> b) {
3057 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3058}
3059
3060// ------------------------------ OddEvenBlocks
3061template <typename T, size_t N>
3062HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
3063 return even;
3064}
3065
3066// ------------------------------ SwapAdjacentBlocks
3067
3068template <typename T, size_t N>
3069HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
3070 return v;
3071}
3072
3073// ------------------------------ ReverseBlocks
3074
3075// Single block: no change
3076template <typename T>
3077HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
3078 return v;
3079}
3080
3081// ================================================== CONVERT
3082
3083// ------------------------------ Promotions (part w/ narrow lanes -> full)
3084
3085// Unsigned: zero-extend.
3086template <size_t N, HWY_IF_LE128(uint16_t, N)>
3087HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
3088 const Vec128<uint8_t, N> v) {
3089 return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
3090}
3091template <size_t N, HWY_IF_LE128(uint32_t, N)>
3092HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
3093 const Vec128<uint8_t, N> v) {
3094 return Vec128<uint32_t, N>{
3095 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3096}
3097template <size_t N, HWY_IF_LE128(int16_t, N)>
3098HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
3099 const Vec128<uint8_t, N> v) {
3100 return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
3101}
3102template <size_t N, HWY_IF_LE128(int32_t, N)>
3103HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3104 const Vec128<uint8_t, N> v) {
3105 return Vec128<int32_t, N>{
3106 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3107}
3108template <size_t N, HWY_IF_LE128(uint32_t, N)>
3109HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
3110 const Vec128<uint16_t, N> v) {
3111 return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
3112}
3113template <size_t N, HWY_IF_LE128(uint64_t, N)>
3114HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
3115 const Vec128<uint32_t, N> v) {
3116 return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
3117}
3118
3119template <size_t N, HWY_IF_LE128(int32_t, N)>
3120HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3121 const Vec128<uint16_t, N> v) {
3122 return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
3123}
3124
3125// Signed: replicate sign bit.
3126template <size_t N, HWY_IF_LE128(int16_t, N)>
3127HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
3128 const Vec128<int8_t, N> v) {
3129 return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
3130}
3131template <size_t N, HWY_IF_LE128(int32_t, N)>
3132HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3133 const Vec128<int8_t, N> v) {
3134 return Vec128<int32_t, N>{
3135 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
3136}
3137template <size_t N, HWY_IF_LE128(int32_t, N)>
3138HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
3139 const Vec128<int16_t, N> v) {
3140 return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
3141}
3142template <size_t N, HWY_IF_LE128(int64_t, N)>
3143HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
3144 const Vec128<int32_t, N> v) {
3145 return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
3146}
3147
3148template <size_t N, HWY_IF_LE128(double, N)>
3149HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
3150 const Vec128<int32_t, N> v) {
3151 return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
3152}
3153
3154template <size_t N, HWY_IF_LE128(float, N)>
3155HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
3156 const Vec128<float16_t, N> v) {
3157 const RebindToSigned<decltype(df32)> di32;
3158 const RebindToUnsigned<decltype(df32)> du32;
3159 // Expand to u32 so we can shift.
3160 const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
3161 const auto sign = ShiftRight<15>(bits16);
3162 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
3163 const auto mantissa = bits16 & Set(du32, 0x3FF);
3164 const auto subnormal =
3165 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
3166 Set(df32, 1.0f / 16384 / 1024));
3167
3168 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
3169 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
3170 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3171 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
3172 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3173}
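// Worked example (illustrative only) of the re-biasing above: binary16
// 3.0 has biased_exp = 16 and mantissa = 0x200. The binary32 result keeps
// the mantissa (shifted left by 23-10) and uses biased exponent
// 16 + (127 - 15) = 128, i.e. 1.5 * 2^1 = 3.0 as expected.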
3174
3175template <size_t N, HWY_IF_LE128(float, N)>
3176HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
3177 const Vec128<bfloat16_t, N> v) {
3178 const Rebind<uint16_t, decltype(df32)> du16;
3179 const RebindToSigned<decltype(df32)> di32;
3180 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3181}
3182
3183// ------------------------------ Demotions (full -> part w/ narrow lanes)
3184
3185template <size_t N>
3186HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
3187 const Vec128<int32_t, N> v) {
3188 return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
3189}
3190
3191template <size_t N>
3192HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
3193 const Vec128<int32_t, N> v) {
3194 return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
3195}
3196
3197template <size_t N>
3198HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3199 const Vec128<int32_t, N> v) {
3200 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3201 return Vec128<uint8_t, N>{
3202 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3203}
3204
3205template <size_t N>
3206HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
3207 const Vec128<int16_t, N> v) {
3208 return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
3209}
3210
3211template <size_t N>
3212HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
3213 const Vec128<int32_t, N> v) {
3214 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3215 return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
3216}
3217
3218template <size_t N>
3219HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
3220 const Vec128<int16_t, N> v) {
3221 return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
3222}
3223
3224template <size_t N>
3225HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* di */,
3226 const Vec128<double, N> v) {
3227 return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
3228}
3229
3230template <size_t N>
3231HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
3232 const Vec128<float, N> v) {
3233 const RebindToUnsigned<decltype(df16)> du16;
3234 const Rebind<uint32_t, decltype(du16)> du;
3235 const RebindToSigned<decltype(du)> di;
3236 const auto bits32 = BitCast(du, v);
3237 const auto sign = ShiftRight<31>(bits32);
3238 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3239 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3240
3241 const auto k15 = Set(di, 15);
3242 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3243 const auto is_tiny = exp < Set(di, -24);
3244
3245 const auto is_subnormal = exp < Set(di, -14);
3246 const auto biased_exp16 =
3247 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3248 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3249 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3250 (mantissa32 >> (Set(du, 13) + sub_exp));
3251 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3252 ShiftRight<13>(mantissa32)); // <1024
3253
3254 const auto sign16 = ShiftLeft<15>(sign);
3255 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3256 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3257 return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
3258}
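// Worked example (illustrative only): for binary32 0.5, biased_exp32 is
// 126, so exp = -1 and biased_exp16 = -1 + 15 = 14; with a zero mantissa
// the result is 0x3800, the binary16 encoding of 0.5. Inputs below 2^-24
// flush to zero via is_tiny.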
3259
3260template <size_t N>
3261HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
3262 const Vec128<float, N> v) {
3263 const Rebind<int32_t, decltype(dbf16)> di32;
3264 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3265 const Rebind<uint16_t, decltype(dbf16)> du16;
3266 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3267 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3268}
3269
3270template <size_t N>
3271HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
3272 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
3273 const RebindToUnsigned<decltype(dbf16)> du16;
3274 const Repartition<uint32_t, decltype(dbf16)> du32;
3275 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
3276 const auto u16 = OddEven(BitCast(du16, a), BitCast(du16, b_in_even));
3277 return BitCast(dbf16, u16);
3278}
3279
3280// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
3281// above 2*N.
3282HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
3283 Vec128<int32_t, 1> a,
3284 Vec128<int32_t, 1> b) {
3285 const Half<decltype(dn)> dnh;
3286 // Pretend the result has twice as many lanes so we can InterleaveLower.
3287 const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
3288 const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
3289 return InterleaveLower(an, bn);
3290}
3291HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
3292 Vec128<int32_t, 2> a,
3293 Vec128<int32_t, 2> b) {
3294 const Half<decltype(dn)> dnh;
3295 // Pretend the result has twice as many lanes so we can InterleaveLower.
3296 const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
3297 const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
3298 return InterleaveLower(an, bn);
3299}
3300HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
3301 Vec128<int32_t> a, Vec128<int32_t> b) {
3302 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
3303}
3304
3305// For already range-limited input [0, 255].
3306template <size_t N>
3307HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
3308 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
3309 return Vec128<uint8_t, N>{
3310 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3311}
3312
3313// ------------------------------ Truncations
3314
3315template <typename From, typename To, HWY_IF_UNSIGNED(From),
3316 HWY_IF_UNSIGNED(To),
3317 hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
3318HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
3319 const Vec128<From, 1> v) {
3320 const Repartition<To, DFromV<decltype(v)>> d;
3321 const auto v1 = BitCast(d, v);
3322 return Vec128<To, 1>{v1.raw};
3323}
3324
3325HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
3326 const Vec128<uint64_t> v) {
3327 const Full128<uint8_t> d;
3328 const auto v1 = BitCast(d, v);
3329 const auto v2 = ConcatEven(d, v1, v1);
3330 const auto v4 = ConcatEven(d, v2, v2);
3331 return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
3332}
3333
3334HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
3335 const Vec128<uint64_t> v) {
3336 const Full128<uint16_t> d;
3337 const auto v1 = BitCast(d, v);
3338 const auto v2 = ConcatEven(d, v1, v1);
3339 return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
3340}
3341
3342HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
3343 const Vec128<uint64_t> v) {
3344 const Full128<uint32_t> d;
3345 const auto v1 = BitCast(d, v);
3346 return LowerHalf(ConcatEven(d, v1, v1));
3347}
3348
3349template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
3350HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
3351 const Vec128<uint32_t, N> v) {
3352 const Full128<uint8_t> d;
3353 const auto v1 = Vec128<uint8_t>{v.raw};
3354 const auto v2 = ConcatEven(d, v1, v1);
3355 const auto v3 = ConcatEven(d, v2, v2);
3356 return Vec128<uint8_t, N>{v3.raw};
3357}
3358
3359template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
3360HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
3361 const Vec128<uint32_t, N> v) {
3362 const Full128<uint16_t> d;
3363 const auto v1 = Vec128<uint16_t>{v.raw};
3364 const auto v2 = ConcatEven(d, v1, v1);
3365 return Vec128<uint16_t, N>{v2.raw};
3366}
3367
3368template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
3369HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
3370 const Vec128<uint16_t, N> v) {
3371 const Full128<uint8_t> d;
3372 const auto v1 = Vec128<uint8_t>{v.raw};
3373 const auto v2 = ConcatEven(d, v1, v1);
3374 return Vec128<uint8_t, N>{v2.raw};
3375}
3376
3377// ------------------------------ Convert i32 <=> f32 (Round)
3378
3379template <size_t N>
3380HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
3381 const Vec128<int32_t, N> v) {
3382 return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
3383}
3384template <size_t N>
3385HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
3386 const Vec128<uint32_t, N> v) {
3387 return Vec128<float, N>{wasm_f32x4_convert_u32x4(v.raw)};
3388}
3389// Truncates (rounds toward zero).
3390template <size_t N>
3391HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
3392 const Vec128<float, N> v) {
3393 return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
3394}
3395
3396template <size_t N>
3397HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
3398 return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
3399}
3400
3401// ================================================== MISC
3402
3403// ------------------------------ SumsOf8 (ShiftRight, Add)
3404template <size_t N>
3405HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
3406 const DFromV<decltype(v)> du8;
3407 const RepartitionToWide<decltype(du8)> du16;
3408 const RepartitionToWide<decltype(du16)> du32;
3409 const RepartitionToWide<decltype(du32)> du64;
3410 using VU16 = VFromD<decltype(du16)>;
3411
3412 const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
3413 const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
3414 const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
3415
3416 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
3417 BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
3418 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
3419 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
3420 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
3421 BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
3422 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
3423 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
3424 return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
3425}
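// Worked example (illustrative only): each u64 lane receives the sum of
// its 8 bytes via the three shift+add steps above. For input bytes
// {1, 2, .., 16} the result is {1+2+..+8, 9+10+..+16} = {36, 100}.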
3426
3427// ------------------------------ LoadMaskBits (TestBit)
3428
3429namespace detail {
3430
3431template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3432HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3433 const RebindToUnsigned<decltype(d)> du;
3434 // Easier than Set(), which would require a type wider than 8 bits and
3435 // thus would not compile for T=uint8_t, N=1.
3436 const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
3437
3438 // Replicate bytes 8x such that each byte contains the bit that governs it.
3439 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
3440 1, 1, 1, 1, 1, 1, 1, 1};
3441 const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
3442
3443 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
3444 1, 2, 4, 8, 16, 32, 64, 128};
3445 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
3446}
3447
3448template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3449HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3450 const RebindToUnsigned<decltype(d)> du;
3451 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3452 return RebindMask(
3453 d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
3454}
3455
3456template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3457HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3458 const RebindToUnsigned<decltype(d)> du;
3459 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
3460 return RebindMask(
3461 d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
3462}
3463
3464template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3465HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
3466 const RebindToUnsigned<decltype(d)> du;
3467 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
3468 return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
3469}
3470
3471} // namespace detail
3472
3473// `p` points to at least 8 readable bytes, not all of which need be valid.
3474template <typename T, size_t N, HWY_IF_LE128(T, N)>
3475HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
3476 const uint8_t* HWY_RESTRICT bits) {
3477 uint64_t mask_bits = 0;
3478 CopyBytes<(N + 7) / 8>(bits, &mask_bits);
3479 return detail::LoadMaskBits(d, mask_bits);
3480}
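// Usage sketch (illustrative only): bit i of the packed input governs
// lane i (LSB first):
//   const Full128<uint32_t> d;
//   const uint8_t packed = 0b0101;  // lanes 0 and 2
//   const Mask128<uint32_t> m = LoadMaskBits(d, &packed);
//   // CountTrue(d, m) == 2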
3481
3482// ------------------------------ Mask
3483
3484namespace detail {
3485
3486// Full
3487template <typename T>
3488HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3489 const Mask128<T> mask) {
3490 alignas(16) uint64_t lanes[2];
3491 wasm_v128_store(lanes, mask.raw);
3492
3493 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3494 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
3495 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
3496 return (hi + lo);
3497}
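// Note (illustrative explanation): each mask byte is 0x00 or 0xFF, and
// multiplying by kMagic = 0x0103070F1F3F80 accumulates them so that bits
// 56..63 of the product hold one bit per byte (bit i set iff byte i was
// 0xFF); >> 56 then packs 8 lanes into 8 bits. The upper 8 lanes use the
// same trick with >> 48 and mask 0xFF00.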
3498
3499// 64-bit
3500template <typename T>
3501HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3502 const Mask128<T, 8> mask) {
3503 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3504 return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
3505 kMagic) >>
3506 56;
3507}
3508
3509// 32-bit or less: need masking
3510template <typename T, size_t N, HWY_IF_LE32(T, N)>
3511HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3512 const Mask128<T, N> mask) {
3513 uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
3514 // Clear potentially undefined bytes.
3515 bytes &= (1ULL << (N * 8)) - 1;
3516 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3517 return (bytes * kMagic) >> 56;
3518}
3519
3520template <typename T, size_t N>
3521HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
3522 const Mask128<T, N> mask) {
3523 // Remove useless lower half of each u16 while preserving the sign bit.
3524 const __i16x8 zero = wasm_i16x8_splat(0);
3525 const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
3526 return BitsFromMask(hwy::SizeTag<1>(), mask8);
3527}
3528
3529template <typename T, size_t N>
3530HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
3531 const Mask128<T, N> mask) {
3532 const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
3533 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
3534 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
3535 alignas(16) uint32_t lanes[4];
3536 wasm_v128_store(lanes, sliced_mask);
3537 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
3538}
3539
3540template <typename T, size_t N>
3541HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
3542 const Mask128<T, N> mask) {
3543 const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
3544 const __i64x2 slice = wasm_i64x2_make(1, 2);
3545 const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
3546 alignas(16) uint64_t lanes[2];
3547 wasm_v128_store(lanes, sliced_mask);
3548 return lanes[0] | lanes[1];
3549}
3550
3551// Returns the lowest N bits for the BitsFromMask result.
3552template <typename T, size_t N>
3553constexpr uint64_t OnlyActive(uint64_t bits) {
3554 return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
3555}
3556
3557// Returns 0xFF for bytes with index >= N, otherwise 0.
3558template <size_t N>
3559constexpr __i8x16 BytesAbove() {
3560 return
3561 (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
3562 : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
3563 : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
3564 : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
3565 : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
3566 : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
3567 : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
3568 : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
3569 : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
3570 : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3571 -1, -1, -1, -1, -1)
3572 : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3573 -1, -1, -1, -1)
3574 : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
3575 -1, -1, -1, -1)
3576 : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
3577 -1, -1, -1)
3578 : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
3579 -1, -1, -1)
3580 : (N == 11)
3581 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
3582 : (N == 13)
3583 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
3584 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
3585}
3586
3587template <typename T, size_t N>
3588HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
3589 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
3590}
3591
3592template <typename T>
3593HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
3594 return PopCount(BitsFromMask(tag, m));
3595}
3596
3597template <typename T>
3598HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
3599 return PopCount(BitsFromMask(tag, m));
3600}
3601
3602template <typename T>
3603HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
3604 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
3605 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
3606 alignas(16) uint64_t lanes[2];
3607 wasm_v128_store(lanes, shifted_bits);
3608 return PopCount(lanes[0] | lanes[1]);
3609}
3610
3611template <typename T>
3612HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
3613 alignas(16) int64_t lanes[2];
3614 wasm_v128_store(lanes, m.raw);
3615 return static_cast<size_t>(-(lanes[0] + lanes[1]));
3616}
3617
3618} // namespace detail
3619
3620// `p` points to at least 8 writable bytes.
3621template <typename T, size_t N>
3622HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
3623 const Mask128<T, N> mask, uint8_t* bits) {
3624 const uint64_t mask_bits = detail::BitsFromMask(mask);
3625 const size_t kNumBytes = (N + 7) / 8;
3626 CopyBytes<kNumBytes>(&mask_bits, bits);
3627 return kNumBytes;
3628}
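// Usage sketch, not part of this header (hypothetical Example() helper):
// for N=4 u32 lanes, StoreMaskBits writes (4+7)/8 = 1 byte holding mask
// bits 0..3.
#if 0
void Example() {
  const Full128<uint32_t> d;
  const auto m = Eq(Iota(d, 0), Set(d, 2u));  // only lane 2 is true
  uint8_t bits[8] = {0};
  const size_t written = StoreMaskBits(d, m, bits);
  HWY_ASSERT(written == 1 && bits[0] == 4);  // bit 2
}
#endif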
3629
3630template <typename T, size_t N>
3631HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
3632 return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
3633}
3634
3635// Partial vector
3636template <typename T, size_t N, HWY_IF_LE64(T, N)>
3637HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
3638 // Ensure all undefined bytes are 0.
3639 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3640 return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
3641}
3642
3643// Full vector
3644template <typename T>
3645HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
3646#if 0
3647 // Casting followed by wasm_i8x16_any_true results in wasm error:
3648 // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
3649 const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
3650 return !wasm_i8x16_any_true(v8.raw);
3651#else
3652 (void)d;
3653 return (wasm_i64x2_extract_lane(m.raw, 0) |
3654 wasm_i64x2_extract_lane(m.raw, 1)) == 0;
3655#endif
3656}
3657
3658// Full vector
3659namespace detail {
3660template <typename T>
3661HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
3662 return wasm_i8x16_all_true(m.raw);
3663}
3664template <typename T>
3665HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
3666 return wasm_i16x8_all_true(m.raw);
3667}
3668template <typename T>
3669HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
3670 return wasm_i32x4_all_true(m.raw);
3671}
3672template <typename T>
3673HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
3674 return wasm_i64x2_all_true(m.raw);
3675}
3676
3677} // namespace detail
3678
3679template <typename T, size_t N>
3680HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
3681 return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
3682}
3683
3684// Partial vectors
3685
3686template <typename T, size_t N, HWY_IF_LE64(T, N)>
3687HWY_API bool AllFalse(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
3688 // Ensure all undefined bytes are 0.
3689 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3690 return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
3691}
3692
3693template <typename T, size_t N, HWY_IF_LE64(T, N)>
3694HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
3695 // Ensure all undefined bytes are FF.
3696 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3697 return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
3698}
3699
3700template <typename T, size_t N>
3701HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
3702 const Mask128<T, N> mask) {
3703 const uint64_t bits = detail::BitsFromMask(mask);
3704 return Num0BitsBelowLS1Bit_Nonzero64(bits);
3705}
3706
3707template <typename T, size_t N>
3708HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
3709 const Mask128<T, N> mask) {
3710 const uint64_t bits = detail::BitsFromMask(mask);
3711 return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
3712}
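// Usage sketch, not part of this header (hypothetical Example() helper):
// FindFirstTrue returns -1 for an all-false mask, whereas FindKnownFirstTrue
// may only be called when at least one lane is known to be true.
#if 0
void Example() {
  const Full128<uint32_t> d;
  const auto m = Lt(Iota(d, 0), Set(d, 2u));  // lanes 0 and 1 are true
  HWY_ASSERT(FindFirstTrue(d, m) == 0);
  HWY_ASSERT(FindFirstTrue(d, Lt(Iota(d, 0), Set(d, 0u))) == -1);
}
#endif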
3713
3714// ------------------------------ Compress
3715
3716namespace detail {
3717
3718template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3719HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
3720 HWY_DASSERT(mask_bits < 256);
3721 const Simd<T, N, 0> d;
3722 const Rebind<uint8_t, decltype(d)> d8;
3723 const Simd<uint16_t, N, 0> du;
3724
3725 // We need byte indices for TableLookupBytes (one vector's worth for each of
3726 // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
3727 // can instead store lane indices and convert to byte indices (2*lane + 0..1),
3728 // with the doubling baked into the table. Unpacking nibbles is likely more
3729 // costly than the higher cache footprint from storing bytes.
3730 alignas(16) constexpr uint8_t table[256 * 8] = {
3731 // PrintCompress16x8Tables
3732 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3733 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3734 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
3735 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3736 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
3737 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
3738 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
3739 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3740 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
3741 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
3742 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
3743 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
3744 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
3745 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
3746 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
3747 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3748 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
3749 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
3750 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
3751 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
3752 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
3753 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
3754 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
3755 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
3756 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
3757 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
3758 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
3759 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
3760 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
3761 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
3762 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
3763 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3764 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
3765 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
3766 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
3767 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
3768 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
3769 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
3770 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
3771 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
3772 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
3773 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
3774 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
3775 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
3776 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
3777 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
3778 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
3779 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
3780 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
3781 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
3782 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
3783 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
3784 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
3785 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
3786 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
3787 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
3788 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
3789 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
3790 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
3791 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
3792 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
3793 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
3794 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
3795 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3796 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
3797 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
3798 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
3799 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
3800 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
3801 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
3802 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
3803 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
3804 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
3805 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
3806 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
3807 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
3808 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
3809 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
3810 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
3811 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
3812 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
3813 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
3814 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
3815 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
3816 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
3817 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
3818 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
3819 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
3820 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
3821 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
3822 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
3823 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
3824 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
3825 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
3826 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
3827 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
3828 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
3829 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
3830 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
3831 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
3832 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
3833 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
3834 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
3835 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
3836 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
3837 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
3838 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
3839 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
3840 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
3841 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
3842 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
3843 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
3844 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
3845 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
3846 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
3847 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
3848 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
3849 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
3850 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
3851 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
3852 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
3853 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
3854 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
3855 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
3856 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
3857 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
3858 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
3859 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
3860
3861 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
3862 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
3863 return BitCast(d, pairs + Set(du, 0x0100));
3864}
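// Illustrative sketch, not part of this header: ZipLower(byte_idx, byte_idx)
// repeats each table entry into both bytes of a u16, and adding 0x0100 turns
// the pair (idx, idx) into the byte offsets (idx, idx + 1) of the u16 lane
// to gather. Scalar model of one lane (hypothetical helper):
#if 0
static uint16_t PairFromIdx(uint8_t idx) {  // idx = 2 * lane, from the table
  const uint16_t pair = static_cast<uint16_t>(idx | (idx << 8));  // ZipLower
  return static_cast<uint16_t>(pair + 0x0100);  // bytes (idx, idx + 1)
}
#endif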
3865
3866template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3867HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
3868 HWY_DASSERT(mask_bits < 256);
3869 const Simd<T, N, 0> d;
3870 const Rebind<uint8_t, decltype(d)> d8;
3871 const Simd<uint16_t, N, 0> du;
3872
3873 // We need byte indices for TableLookupBytes (one vector's worth for each of
3874 // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
3875 // can instead store lane indices and convert to byte indices (2*lane + 0..1),
3876 // with the doubling baked into the table. Unpacking nibbles is likely more
3877 // costly than the higher cache footprint from storing bytes.
3878 alignas(16) constexpr uint8_t table[256 * 8] = {
3879 // PrintCompressNot16x8Tables
3880 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
3881 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
3882 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
3883 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
3884 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
3885 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
3886 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
3887 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
3888 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
3889 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
3890 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
3891 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
3892 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
3893 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
3894 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
3895 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
3896 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
3897 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
3898 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
3899 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
3900 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
3901 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
3902 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
3903 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
3904 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
3905 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
3906 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
3907 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
3908 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
3909 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
3910 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
3911 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
3912 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
3913 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
3914 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
3915 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
3916 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
3917 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
3918 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
3919 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
3920 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
3921 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
3922 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
3923 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
3924 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
3925 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
3926 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
3927 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
3928 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
3929 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
3930 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
3931 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
3932 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
3933 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
3934 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
3935 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
3936 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
3937 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
3938 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
3939 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
3940 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
3941 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
3942 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
3943 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
3944 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
3945 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
3946 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
3947 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
3948 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
3949 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
3950 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
3951 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
3952 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
3953 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
3954 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
3955 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
3956 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
3957 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
3958 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
3959 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
3960 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
3961 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
3962 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
3963 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
3964 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
3965 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
3966 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
3967 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
3968 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
3969 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
3970 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
3971 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
3972 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
3973 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
3974 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
3975 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
3976 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
3977 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
3978 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
3979 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
3980 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
3981 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
3982 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
3983 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
3984 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
3985 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
3986 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
3987 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
3988 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
3989 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
3990 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
3991 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
3992 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
3993 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
3994 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
3995 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
3996 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
3997 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
3998 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
3999 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
4000 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
4001 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
4002 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
4003 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
4004 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
4005 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
4006 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
4007 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
4008
4009 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
4010 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
4011 return BitCast(d, pairs + Set(du, 0x0100));
4012}
4013
4014template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4015HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
4016 HWY_DASSERT(mask_bits < 16);
4017
4018 // There are only 4 lanes, so we can afford to load the index vector directly.
4019 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
4020 // PrintCompress32x4Tables
4021 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4022 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4023 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
4024 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4025 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
4026 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
4027 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
4028 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4029 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
4030 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
4031 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
4032 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
4033 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
4034 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
4035 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
4036 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4037 const Simd<T, N, 0> d;
4038 const Repartition<uint8_t, decltype(d)> d8;
4039 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
4040}
4041
4042template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4043HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
4044 HWY_DASSERT(mask_bits < 16);
4045
4046 // There are only 4 lanes, so we can afford to load the index vector directly.
4047 alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
4048 // PrintCompressNot32x4Tables
4049 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
4050 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
4051 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
4052 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
4053 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
4054 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
4055 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4056 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4057 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
4058 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
4059 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
4060 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
4061 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4062 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
4063 12, 13, 14, 15};
4064 const Simd<T, N, 0> d;
4065 const Repartition<uint8_t, decltype(d)> d8;
4066 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
4067}
4068
4069template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4070HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
4071 HWY_DASSERT(mask_bits < 4);
4072
4073 // There are only 2 lanes, so we can afford to load the index vector directly.
4074 alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
4075 // PrintCompress64x2Tables
4076 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4077 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4078 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4079 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4080
4081 const Simd<T, N, 0> d;
4082 const Repartition<uint8_t, decltype(d)> d8;
4083 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
4084}
4085
4086template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4087HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
4088 HWY_DASSERT(mask_bits < 4);
4089
4090 // There are only 2 lanes, so we can afford to load the index vector directly.
4091 alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
4092 // PrintCompressNot64x2Tables
4093 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4094 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4095 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4096 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4097
4098 const Simd<T, N, 0> d;
4099 const Repartition<uint8_t, decltype(d)> d8;
4100 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
4101}
4102
4103// Helper functions called by both Compress and CompressStore - avoids a
4104// redundant BitsFromMask in the latter.
4105
4106template <typename T, size_t N>
4107HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
4108 const auto idx = detail::IdxFromBits<T, N>(mask_bits);
4109 const DFromV<decltype(v)> d;
4110 const RebindToSigned<decltype(d)> di;
4111 return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4112}
4113
4114template <typename T, size_t N>
4115HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
4116 const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
4117 const DFromV<decltype(v)> d;
4118 const RebindToSigned<decltype(d)> di;
4119 return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4120}
4121
4122} // namespace detail
4123
4124template <typename T>
4125struct CompressIsPartition {
4126#if HWY_TARGET == HWY_WASM_EMU256
4127 enum { value = 0 };
4128#else
4129 enum { value = (sizeof(T) != 1) };
4130#endif
4131};
4132
4133// Single lane: no-op
4134template <typename T>
4135HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
4136 return v;
4137}
4138
4139// Two lanes: conditional swap
4140template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4141HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
4142 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
4143 const Full128<T> d;
4144 const Vec128<T> m = VecFromMask(d, mask);
4145 const Vec128<T> maskL = DupEven(m);
4146 const Vec128<T> maskH = DupOdd(m);
4147 const Vec128<T> swap = AndNot(maskL, maskH);
4148 return IfVecThenElse(swap, Shuffle01(v), v);
4149}
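// Illustration: swap = AndNot(maskL, maskH) is true only for mask
// (lo, hi) = (0, 1), the one case where the single true lane must move down.
// All four cases: (0,0) keep, (1,0) keep, (1,1) keep, (0,1) swap.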
4150
4151// General case, 2 or 4 byte lanes
4152template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
4153HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4154 return detail::Compress(v, detail::BitsFromMask(mask));
4155}
4156
4157// Single lane: no-op
4158template <typename T>
4159HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
4160 return v;
4161}
4162
4163// Two lanes: conditional swap
4164template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4165HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
4166 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
4167 const Full128<T> d;
4168 const Vec128<T> m = VecFromMask(d, mask);
4169 const Vec128<T> maskL = DupEven(m);
4170 const Vec128<T> maskH = DupOdd(m);
4171 const Vec128<T> swap = AndNot(maskH, maskL);
4172 return IfVecThenElse(swap, Shuffle01(v), v);
4173}
4174
4175// General case, 2 or 4 byte lanes
4176template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
4177HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
4178 // For partial vectors, we cannot pull the Not() into the table because
4179 // BitsFromMask clears the upper bits.
4180 if (N < 16 / sizeof(T)) {
4181 return detail::Compress(v, detail::BitsFromMask(Not(mask)));
4182 }
4183 return detail::CompressNot(v, detail::BitsFromMask(mask));
4184}
4185
4186// ------------------------------ CompressBlocksNot
4187HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
4188 Mask128<uint64_t> /* m */) {
4189 return v;
4190}
4191
4192// ------------------------------ CompressBits
4193template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4194HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
4195 const uint8_t* HWY_RESTRICT bits) {
4196 uint64_t mask_bits = 0;
4197 constexpr size_t kNumBytes = (N + 7) / 8;
4198 CopyBytes<kNumBytes>(bits, &mask_bits);
4199 if (N < 8) {
4200 mask_bits &= (1ull << N) - 1;
4201 }
4202
4203 return detail::Compress(v, mask_bits);
4204}
4205
4206// ------------------------------ CompressStore
4207template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4208HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
4209 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
4210 const uint64_t mask_bits = detail::BitsFromMask(mask);
4211 const auto c = detail::Compress(v, mask_bits);
4212 StoreU(c, d, unaligned);
4213 return PopCount(mask_bits);
4214}
4215
4216// ------------------------------ CompressBlendedStore
4217template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4218HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
4219 Simd<T, N, 0> d,
4220 T* HWY_RESTRICT unaligned) {
4221 const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
4222 using TU = TFromD<decltype(du)>;
4223 const uint64_t mask_bits = detail::BitsFromMask(m);
4224 const size_t count = PopCount(mask_bits);
4225 const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
4226 const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
4227 BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
4228 return count;
4229}
4230
4231// ------------------------------ CompressBitsStore
4232
4233template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4234HWY_API size_t CompressBitsStore(Vec128<T, N> v,
4235 const uint8_t* HWY_RESTRICT bits,
4236 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
4237 uint64_t mask_bits = 0;
4238 constexpr size_t kNumBytes = (N + 7) / 8;
4239 CopyBytes<kNumBytes>(bits, &mask_bits);
4240 if (N < 8) {
4241 mask_bits &= (1ull << N) - 1;
4242 }
4243
4244 const auto c = detail::Compress(v, mask_bits);
4245 StoreU(c, d, unaligned);
4246 return PopCount(mask_bits);
4247}
4248
4249// ------------------------------ StoreInterleaved2/3/4
4250
4251// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
4252// generic_ops-inl.h.
4253
4254// ------------------------------ MulEven/Odd (Load)
4255
4256HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
4257 const Vec128<uint64_t> b) {
4258 alignas(16) uint64_t mul[2];
4259 mul[0] =
4260 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
4261 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
4262 return Load(Full128<uint64_t>(), mul);
4263}
4264
4265HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
4266 const Vec128<uint64_t> b) {
4267 alignas(16) uint64_t mul[2];
4268 mul[0] =
4269 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
4270 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
4271 return Load(Full128<uint64_t>(), mul);
4272}
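// Illustrative sketch, not part of this header: Mul128 returns the low
// 64 bits of the full 64x64-bit product and writes the high half through the
// pointer, so lane 0 of the result holds the low half and lane 1 the high.
#if 0
void Example() {
  uint64_t hi;
  const uint64_t lo = Mul128(~0ull, 2ull, &hi);  // (2^64 - 1) * 2
  HWY_ASSERT(lo == ~1ull && hi == 1u);           // = 2^65 - 2
}
#endif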
4273
4274// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4275
4276template <size_t N>
4277HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
4278 Vec128<bfloat16_t, 2 * N> a,
4279 Vec128<bfloat16_t, 2 * N> b,
4280 const Vec128<float, N> sum0,
4281 Vec128<float, N>& sum1) {
4282 const Rebind<uint32_t, decltype(df32)> du32;
4283 using VU32 = VFromD<decltype(du32)>;
4284 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
4285 // Using shift/and instead of Zip leads to the odd/even order that
4286 // RearrangeToOddPlusEven prefers.
4287 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
4288 const VU32 ao = And(BitCast(du32, a), odd);
4289 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
4290 const VU32 bo = And(BitCast(du32, b), odd);
4291 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
4292 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
4293}
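// Illustrative sketch, not part of this header: a bfloat16 is exactly the
// upper 16 bits of the corresponding binary32, so moving it to the upper
// half of a u32 lane (lower half zero) reinterprets it as that float without
// any rounding. Scalar model (hypothetical helper):
#if 0
static float F32FromBF16(uint16_t bits16) {
  const uint32_t bits32 = static_cast<uint32_t>(bits16) << 16;
  float f;
  CopyBytes<sizeof(f)>(&bits32, &f);  // bit-exact reinterpretation
  return f;
}
#endif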
4294
4295// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
4296// safe.
4297template <size_t N>
4298HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
4299 Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
4300 Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
4301 Vec128<int32_t, N>& /*sum1*/) {
4302 return sum0 + Vec128<int32_t, N>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
4303}
4304
4305// ------------------------------ RearrangeToOddPlusEven
4306template <size_t N>
4307HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(
4308 const Vec128<int32_t, N> sum0, const Vec128<int32_t, N> /*sum1*/) {
4309 return sum0; // invariant already holds
4310}
4311
4312template <size_t N>
4313HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0,
4314 const Vec128<float, N> sum1) {
4315 return Add(sum0, sum1);
4316}
4317
4318// ------------------------------ Reductions
4319
4320namespace detail {
4321
4322// N=1 for any T: no-op
4323template <typename T>
4324HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4325 const Vec128<T, 1> v) {
4326 return v;
4327}
4328template <typename T>
4329HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4330 const Vec128<T, 1> v) {
4331 return v;
4332}
4333template <typename T>
4334HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4335 const Vec128<T, 1> v) {
4336 return v;
4337}
4338
4339// u32/i32/f32:
4340
4341// N=2
4342template <typename T>
4343HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
4344 const Vec128<T, 2> v10) {
4345 return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
4346}
4347template <typename T>
4348HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
4349 const Vec128<T, 2> v10) {
4350 return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
4351}
4352template <typename T>
4353HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4354 const Vec128<T, 2> v10) {
4355 return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
4356}
4357
4358// N=4 (full)
4359template <typename T>
4360HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
4361 const Vec128<T> v3210) {
4362 const Vec128<T> v1032 = Shuffle1032(v3210);
4363 const Vec128<T> v31_20_31_20 = v3210 + v1032;
4364 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4365 return v20_31_20_31 + v31_20_31_20;
4366}
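// Illustration: the reduction takes log2(4) = 2 steps. After the first add,
// lanes alternate between the pair sums 3+1 and 2+0; Shuffle0321 rotates by
// one lane so the second add combines the two pair sums, leaving the total
// 0+1+2+3 in every lane.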
4367template <typename T>
4368HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
4369 const Vec128<T> v3210) {
4370 const Vec128<T> v1032 = Shuffle1032(v3210);
4371 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
4372 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4373 return Min(v20_31_20_31, v31_20_31_20);
4374}
4375template <typename T>
4376HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4377 const Vec128<T> v3210) {
4378 const Vec128<T> v1032 = Shuffle1032(v3210);
4379 const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
4380 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4381 return Max(v20_31_20_31, v31_20_31_20);
4382}
4383
4384// u64/i64/f64:
4385
4386// N=2 (full)
4387template <typename T>
4388HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
4389 const Vec128<T> v10) {
4390 const Vec128<T> v01 = Shuffle01(v10);
4391 return v10 + v01;
4392}
4393template <typename T>
4394HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
4395 const Vec128<T> v10) {
4396 const Vec128<T> v01 = Shuffle01(v10);
4397 return Min(v10, v01);
4398}
4399template <typename T>
4400HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
4401 const Vec128<T> v10) {
4402 const Vec128<T> v01 = Shuffle01(v10);
4403 return Max(v10, v01);
4404}
4405
4406template <size_t N, HWY_IF_GE32(uint16_t, N)>
4407HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
4408 Vec128<uint16_t, N> v) {
4409 const Simd<uint16_t, N, 0> d;
4410 const RepartitionToWide<decltype(d)> d32;
4411 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4412 const auto odd = ShiftRight<16>(BitCast(d32, v));
4413 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
4414 // Also broadcast into odd lanes.
4415 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
4416}
4417template <size_t N, HWY_IF_GE32(int16_t, N)>
4418HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
4419 Vec128<int16_t, N> v) {
4420 const Simd<int16_t, N, 0> d;
4421 const RepartitionToWide<decltype(d)> d32;
4422 // Sign-extend
4423 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
4424 const auto odd = ShiftRight<16>(BitCast(d32, v));
4425 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
4426 // Also broadcast into odd lanes.
4427 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
4428}
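// Illustration: promoting to 32 bits lets the 16-bit sums reuse the
// SizeTag<4> reduction without overflow (at most 8 u16 lanes sum to < 2^19);
// OddEven then re-broadcasts the 16-bit result into every lane.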
4429
4430template <size_t N, HWY_IF_GE32(uint16_t, N)>
4431HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
4432 Vec128<uint16_t, N> v) {
4433 const Simd<uint16_t, N, 0> d;
4434 const RepartitionToWide<decltype(d)> d32;
4435 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4436 const auto odd = ShiftRight<16>(BitCast(d32, v));
4437 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
4438 // Also broadcast into odd lanes.
4439 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
4440}
4441template <size_t N, HWY_IF_GE32(int16_t, N)>
4442HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
4443 Vec128<int16_t, N> v) {
4444 const Simd<int16_t, N, 0> d;
4445 const RepartitionToWide<decltype(d)> d32;
4446 // Sign-extend
4447 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
4448 const auto odd = ShiftRight<16>(BitCast(d32, v));
4449 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
4450 // Also broadcast into odd lanes.
4451 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
4452}
4453
4454template <size_t N, HWY_IF_GE32(uint16_t, N)>
4455HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
4456 Vec128<uint16_t, N> v) {
4457 const Simd<uint16_t, N, 0> d;
4458 const RepartitionToWide<decltype(d)> d32;
4459 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4460 const auto odd = ShiftRight<16>(BitCast(d32, v));
4461 const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
4462 // Also broadcast into odd lanes.
4463 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
4464}
4465template <size_t N, HWY_IF_GE32(int16_t, N)>
4466HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
4467 Vec128<int16_t, N> v) {
4468 const Simd<int16_t, N, 0> d;
4469 const RepartitionToWide<decltype(d)> d32;
4470 // Sign-extend
4471 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
4472 const auto odd = ShiftRight<16>(BitCast(d32, v));
4473 const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
4474 // Also broadcast into odd lanes.
4475 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
4476}
4477
4478} // namespace detail
4479
4480// Supported for u/i/f 32/64. Returns the same value in each lane.
4481template <typename T, size_t N>
4482HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4483 return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4484}
4485template <typename T, size_t N>
4486HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4487 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4488}
4489template <typename T, size_t N>
4490HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4491 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4492}
4493
4494// ------------------------------ Lt128
4495
4496template <typename T, size_t N, HWY_IF_LE128(T, N)>
4497HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
4498 Vec128<T, N> b) {
4499 static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
4500 // Truth table of Eq and Lt for Hi and Lo u64.
4501 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
4502 // =H =L cH cL | out = cH | (=H & cL)
4503 // 0 0 0 0 | 0
4504 // 0 0 0 1 | 0
4505 // 0 0 1 0 | 1
4506 // 0 0 1 1 | 1
4507 // 0 1 0 0 | 0
4508 // 0 1 0 1 | 0
4509 // 0 1 1 0 | 1
4510 // 1 0 0 0 | 0
4511 // 1 0 0 1 | 1
4512 // 1 1 0 0 | 0
4513 const Mask128<T, N> eqHL = Eq(a, b);
4514 const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
4515 // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
4516 // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
4517 // comparison result leftwards requires only 4. IfThenElse compiles to the
4518 // same code as OrAnd().
4519 const Vec128<T, N> ltLx = DupEven(ltHL);
4520 const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
4521 return MaskFromVec(DupOdd(outHx));
4522}
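// Worked example: for a = {hi = 1, lo = 5} and b = {hi = 1, lo = 7}:
// eqHL = (1, x) and ltHL = (0, 1). DupEven copies cL into the upper lane,
// IfThenElse selects it there because the upper halves are equal, and
// DupOdd broadcasts the verdict (a < b: true) into both lanes of the mask.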
4523
4524template <typename T, size_t N, HWY_IF_LE128(T, N)>
4525HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
4526 Vec128<T, N> b) {
4527 const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
4528 return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
4529}
4530
4531// ------------------------------ Eq128
4532
4533template <typename T, size_t N, HWY_IF_LE128(T, N)>
4534HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
4535 Vec128<T, N> b) {
4536 static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
4537 const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
4538 return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
4539}
4540
4541template <typename T, size_t N, HWY_IF_LE128(T, N)>
4542HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
4543 Vec128<T, N> b) {
4544 const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
4545 return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
4546}
4547
4548// ------------------------------ Ne128
4549
4550template <typename T, size_t N, HWY_IF_LE128(T, N)>
4551HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
4552 Vec128<T, N> b) {
4553 static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
4554 const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
4555 return MaskFromVec(Or(Reverse2(d, neHL), neHL));
4556}
4557
4558template <typename T, size_t N, HWY_IF_LE128(T, N)>
4559HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
4560 Vec128<T, N> b) {
4561 const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
4562 return MaskFromVec(InterleaveUpper(d, neHL, neHL));
4563}
4564
4565// ------------------------------ Min128, Max128 (Lt128)
4566
4567// Without a native OddEven, it seems infeasible to go faster than Lt128.
4568template <class D>
4569HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
4570 return IfThenElse(Lt128(d, a, b), a, b);
4571}
4572
4573template <class D>
4574HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
4575 return IfThenElse(Lt128(d, b, a), a, b);
4576}
4577
4578template <class D>
4579HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
4580 return IfThenElse(Lt128Upper(d, a, b), a, b);
4581}
4582
4583template <class D>
4584HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
4585 return IfThenElse(Lt128Upper(d, b, a), a, b);
4586}
4587
4588// NOLINTNEXTLINE(google-readability-namespace-comments)
4589} // namespace HWY_NAMESPACE
4590} // namespace hwy
#define HWY_MAX(a, b)
Definition base.h:135
#define HWY_RESTRICT
Definition base.h:64
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:78
#define HWY_IF_LE64(T, N)
Definition base.h:407
#define HWY_API
Definition base.h:129
#define HWY_IF_LE128(T, N)
Definition base.h:406
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_INLINE
Definition base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:79
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_ASSERT(condition)
Definition base.h:192
#define HWY_IF_UNSIGNED(T)
Definition base.h:414
Definition arm_neon-inl.h:825
detail::Raw128< T >::type raw
Definition wasm_128-inl.h:117
Raw raw
Definition arm_neon-inl.h:835
Definition arm_neon-inl.h:778
T PrivateT
Definition arm_neon-inl.h:782
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition wasm_128-inl.h:83
typename detail::Raw128< T, N >::type Raw
Definition arm_neon-inl.h:779
Raw raw
Definition arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition wasm_128-inl.h:89
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition wasm_128-inl.h:98
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition wasm_128-inl.h:95
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition wasm_128-inl.h:80
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition wasm_128-inl.h:92
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition wasm_128-inl.h:86
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2413
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5447
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition wasm_128-inl.h:2132
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2451
constexpr __i8x16 BytesAbove()
Definition wasm_128-inl.h:3559
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition wasm_128-inl.h:3661
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:535
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:815
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5902
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition wasm_128-inl.h:1688
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:3023
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6153
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition wasm_128-inl.h:1844
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition wasm_128-inl.h:130
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:663
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5063
HWY_INLINE Vec128< T, 1 > SumOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5058
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6162
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5609
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2432
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:671
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5750
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5068
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:5589
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4235
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition arm_neon-inl.h:5364
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:1406
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
Vec128< T, 4/sizeof(T)> Vec32
Definition arm_neon-inl.h:821
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
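Load/Store require HWY_ALIGN-ed pointers; the U variants accept unaligned ones. A sketch of the canonical loop, assuming size is a multiple of Lanes(d) (remainder handling omitted; the function name is hypothetical):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ScaleArray(const float* HWY_RESTRICT in, size_t size,
                float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const auto k = hn::Set(d, 2.0f);
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    hn::StoreU(hn::LoadU(d, in + i) * k, d, out + i);
  }
}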
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
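Lt128, Min128 and Max128 treat each 128-bit block as a single number formed by two u64 lanes (lane 1 is the upper half). A sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void Min128Demo() {
  const hn::Full128<uint64_t> d;
  const uint64_t a_lanes[2] = {5, 1};  // the 128-bit value (1 << 64) + 5
  const uint64_t b_lanes[2] = {0, 2};  // the 128-bit value (2 << 64) + 0
  const auto a = hn::LoadU(d, a_lanes);
  const auto b = hn::LoadU(d, b_lanes);
  const auto smaller = hn::Min128(d, a, b);  // == a
  (void)smaller;
}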
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
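The common initializers, as a sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void InitDemo() {
  const hn::Full128<int32_t> d;
  const auto zero = hn::Zero(d);      // 0, 0, 0, 0
  const auto ones = hn::Set(d, 1);    // 1, 1, 1, 1
  const auto ramp = hn::Iota(d, 10);  // 10, 11, 12, 13
  (void)zero; (void)ones; (void)ramp;
}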
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
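DemoteTo narrows with saturation; the destination descriptor covers half the bytes. A sketch using the overload listed above:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void NarrowDemo() {
  const hn::Full128<int32_t> d32;
  const hn::Full64<uint16_t> du16;
  const auto v = hn::Set(d32, 70000);
  const auto narrowed = hn::DemoteTo(du16, v);  // saturates to 65535
  (void)narrowed;
}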
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
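BitCast reinterprets lanes without changing any bits, e.g. to inspect float sign bits through the unsigned rebind. A sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void BitCastDemo() {
  const hn::Full128<float> df;
  const hn::RebindToUnsigned<decltype(df)> du;  // Full128<uint32_t>
  const auto v = hn::Set(df, -1.0f);
  const auto bits = hn::BitCast(du, v);         // 0xBF800000 in each lane
  (void)bits;
}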
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
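GatherOffset loads each lane from base plus a per-lane byte offset (not an element index). A sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void GatherDemo() {
  const hn::Full128<int32_t> d;
  const int32_t base[4] = {10, 20, 30, 40};
  const auto byte_offsets = hn::Set(d, 8);  // every lane: base + 8 bytes
  const auto g = hn::GatherOffset(d, base, byte_offsets);  // 30 in each lane
  (void)g;
}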
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
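SetTableIndices builds a permutation once; TableLookupLanes applies it. A sketch that reverses the lanes:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void PermuteDemo() {
  const hn::Full128<int32_t> d;
  const int32_t idx[4] = {3, 2, 1, 0};
  const auto table = hn::SetTableIndices(d, idx);
  const auto v = hn::Iota(d, 0);                         // 0, 1, 2, 3
  const auto reversed = hn::TableLookupLanes(v, table);  // 3, 2, 1, 0
  (void)reversed;
}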
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
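CompressStore packs the lanes selected by the mask to the front of the destination and returns how many were written; it may store a full vector, so the destination needs Lanes(d) of slack beyond the packed results. A filtering sketch, assuming size is a multiple of Lanes(d); the function name is hypothetical:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

size_t KeepNonNegative(const float* HWY_RESTRICT in, size_t size,
                       float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  size_t written = 0;
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, in + i);
    written += hn::CompressStore(v, v >= hn::Zero(d), d, out + written);
  }
  return written;
}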
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
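The f32 math and rounding ops, as a sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void MathDemo() {
  const hn::Full128<float> d;
  const auto v = hn::Set(d, 2.25f);
  const auto root = hn::Sqrt(v);           // 1.5
  const auto down = hn::Floor(v);          // 2.0
  const auto rounded = hn::NearestInt(v);  // int32 lanes: 2
  (void)root; (void)down; (void)rounded;
}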
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
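Compile-time shift and rotate counts are template arguments; the *Same variants take a run-time count. A sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ShiftDemo() {
  const hn::Full128<uint32_t> d;
  const auto v = hn::Set(d, 0x80000001u);
  const auto r = hn::RotateRight<1>(v);     // 0xC0000000
  const auto l = hn::ShiftLeft<4>(v);       // 0x00000010
  const auto s = hn::ShiftRightSame(v, 8);  // 0x00800000
  (void)r; (void)l; (void)s;
}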
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
Vec128< T, 8/sizeof(T)> Vec64
Definition arm_neon-inl.h:818
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
const vfloat64m1_t v
Definition rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:806
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:924
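The scalar Mul128 returns the low 64 bits of the product and writes the high half through upper. A sketch:

#include "hwy/base.h"

void Mul128Demo() {
  uint64_t upper;
  const uint64_t lower = hwy::Mul128(~0ull, 2, &upper);
  // (2^64 - 1) * 2 = 2^65 - 2: lower == 0xFFFFFFFFFFFFFFFE, upper == 1
  (void)lower;
}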
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:607
typename EnableIfT< Condition >::type EnableIf
Definition base.h:383
HWY_API size_t PopCount(uint64_t x)
Definition base.h:865
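The scalar bit utilities from hwy/base.h, as a sketch:

#include "hwy/base.h"

void BitsDemo() {
  const size_t ones = hwy::PopCount(0xF0ull);                    // 4
  const size_t tz = hwy::Num0BitsBelowLS1Bit_Nonzero64(0x8ull);  // 3
  (void)ones; (void)tz;
}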
typename detail::Relations< T >::Wide MakeWide
Definition base.h:601
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:595
#define HWY_ALIGN
Definition set_macros-inl.h:83
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
__v128_u raw
Definition wasm_128-inl.h:2509
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition wasm_128-inl.h:150
HWY_INLINE __v128_u operator()(__v128_u v)
Definition wasm_128-inl.h:146
__f32x4 type
Definition wasm_128-inl.h:65
__v128_u type
Definition wasm_128-inl.h:61
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()