diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index 8220e9b718..2ee3dad3a9 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -1050,6 +1050,9 @@ types, and on SVE/RVV.
* V **AndNot**(V a, V b)
: returns `~a[i] & b[i]`.
+*   V **MaskedOrOrZero**(M m, V a, V b): returns the bitwise OR
+    `a[i] | b[i]`, or zero if `m[i]` is false.
+
The following three-argument functions may be more efficient than assembling
them from 2-argument functions:
@@ -2237,6 +2240,22 @@ The following `ReverseN` must not be called if `Lanes(D()) < N`:
must be in the range `[0, 2 * Lanes(d))` but need not be unique. The index
type `TI` must be an integer of the same size as `TFromD`.
+*   V **TableLookupLanesOr**(M m, V a, V b, unspecified) returns the
+    result of `TableLookupLanes(a, unspecified)` where `m[i]` is true, and
+    returns `b[i]` where `m[i]` is false.
+
+*   V **TableLookupLanesOrZero**(M m, V a, unspecified) returns
+    the result of `TableLookupLanes(a, unspecified)` where `m[i]` is true, and
+    returns zero where `m[i]` is false.
+
+*   V **TwoTablesLookupLanesOr**(D d, M m, V a, V b, unspecified)
+    returns the result of `TwoTablesLookupLanes(d, a, b, unspecified)` where
+    `m[i]` is true, and `a[i]` where `m[i]` is false.
+
+*   V **TwoTablesLookupLanesOrZero**(D d, M m, V a, V b, unspecified)
+    returns the result of `TwoTablesLookupLanes(d, a, b, unspecified)` where
+    `m[i]` is true, and zero where `m[i]` is false.
+
* V **Per4LaneBlockShuffle**<size_t kIdx3, size_t kIdx2, size_t
kIdx1, size_t kIdx0>(V v)
does a per 4-lane block shuffle of `v`
if `Lanes(DFromV())` is greater than or equal to 4 or a shuffle of the
@@ -2377,6 +2396,24 @@ more efficient on some targets.
* T **ReduceMin**(D, V v)
: returns the minimum of all lanes.
* T **ReduceMax**(D, V v)
: returns the maximum of all lanes.
+### Masked reductions
+
+**Note**: Horizontal operations (across lanes of the same vector) such as
+reductions are slower than normal SIMD operations and are typically used outside
+critical loops.
+
+All ops in this section ignore lanes where `mask=false`. These are equivalent
+to, and potentially more efficient than, `GetLane(SumOfLanes(d,
+IfThenElseZero(m, v)))` etc. The result is implementation-defined when all mask
+elements are false.
+
+*   T **MaskedReduceSum**(D, M m, V v): returns the sum of all lanes
+    where `m[i]` is `true`.
+*   T **MaskedReduceMin**(D, M m, V v): returns the minimum of all
+    lanes where `m[i]` is `true`.
+*   T **MaskedReduceMax**(D, M m, V v): returns the maximum of all
+    lanes where `m[i]` is `true`.
+
### Crypto
Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 2dde1479de..66ad1dfe3b 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -219,6 +219,15 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
return sv##OP##_##CHAR##BITS(v); \
}
+#define HWY_SVE_RETV_ARGMV_M(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+ return sv##OP##_##CHAR##BITS##_m(b, m, a); \
+ }
+#define HWY_SVE_RETV_ARGMV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a) { \
+ return sv##OP##_##CHAR##BITS##_z(m, a); \
+ }
// vector = f(vector, scalar), e.g. detail::AddN
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -252,6 +261,17 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
return sv##OP##_##CHAR##BITS##_x(m, a, b); \
}
+#define HWY_SVE_RETV_ARGMVV_M(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+ return sv##OP##_##CHAR##BITS##_m(m, a, b); \
+ }
+// User-specified mask. Mask=false value is zero.
+#define HWY_SVE_RETV_ARGMVVZ(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+ return sv##OP##_##CHAR##BITS##_z(m, a, b); \
+ }
#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
HWY_API HWY_SVE_V(BASE, BITS) \
@@ -260,6 +280,13 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
return sv##OP##_##CHAR##BITS(a, b, c); \
}
+#define HWY_SVE_RETV_ARGMVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
+ HWY_SVE_V(BASE, BITS) c) { \
+ return sv##OP##_##CHAR##BITS##_m(m, a, b, c); \
+ }
+
// ------------------------------ Lanes
namespace detail {
@@ -727,6 +754,9 @@ HWY_API V Or(const V a, const V b) {
return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
}
+// ------------------------------ MaskedOrOrZero
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVVZ, MaskedOrOrZero, orr)
+
// ------------------------------ Xor
namespace detail {
@@ -3288,6 +3318,25 @@ HWY_API TFromD ReduceMax(D d, VFromD v) {
return detail::MaxOfLanesM(detail::MakeMask(d), v);
}
+#ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#undef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_MASKED_REDUCE_SCALAR
+#endif
+
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceSum(D /*d*/, M m, VFromD<D> v) {
+  return detail::SumOfLanesM(m, v);
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMin(D /*d*/, M m, VFromD<D> v) {
+  return detail::MinOfLanesM(m, v);
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMax(D /*d*/, M m, VFromD<D> v) {
+  return detail::MaxOfLanesM(m, v);
+}
+
// ------------------------------ SumOfLanes
template
@@ -4755,6 +4804,23 @@ HWY_API V IfNegativeThenElse(V v, V yes, V no) {
static_assert(IsSigned>(), "Only works for signed/float");
return IfThenElse(IsNegative(v), yes, no);
}
+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#else
+#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#endif
+
+#define HWY_SVE_NEG_IF(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) v) { \
+ return sv##OP##_##CHAR##BITS##_m(v, IsNegative(mask), v); \
+ }
+
+HWY_SVE_FOREACH_IF(HWY_SVE_NEG_IF, IfNegativeThenNegOrUndefIfZero, neg)
+
+#undef HWY_SVE_NEG_IF
// ------------------------------ AverageRound (ShiftRight)
@@ -6291,13 +6357,19 @@ HWY_API V HighestSetBitIndex(V v) {
#undef HWY_SVE_IF_NOT_EMULATED_D
#undef HWY_SVE_PTRUE
#undef HWY_SVE_RETV_ARGMVV
+#undef HWY_SVE_RETV_ARGMVVZ
#undef HWY_SVE_RETV_ARGPV
#undef HWY_SVE_RETV_ARGPVN
#undef HWY_SVE_RETV_ARGPVV
#undef HWY_SVE_RETV_ARGV
#undef HWY_SVE_RETV_ARGVN
+#undef HWY_SVE_RETV_ARGMV
+#undef HWY_SVE_RETV_ARGMV_M
+#undef HWY_SVE_RETV_ARGMV_Z
#undef HWY_SVE_RETV_ARGVV
+#undef HWY_SVE_RETV_ARGMVV_M
#undef HWY_SVE_RETV_ARGVVV
+#undef HWY_SVE_RETV_ARGMVVV
#undef HWY_SVE_T
#undef HWY_SVE_UNDEFINED
#undef HWY_SVE_V
diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
index 99b518d99c..efee4bc971 100644
--- a/hwy/ops/generic_ops-inl.h
+++ b/hwy/ops/generic_ops-inl.h
@@ -882,6 +882,28 @@ HWY_API TFromD ReduceMax(D d, VFromD v) {
}
#endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#if (defined(HWY_NATIVE_MASKED_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#undef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_MASKED_REDUCE_SCALAR
+#endif
+
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceSum(D d, M m, VFromD<D> v) {
+  return ReduceSum(d, IfThenElseZero(m, v));
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMin(D d, M m, VFromD<D> v) {
+  // Disabled lanes take the overall maximum so they cannot win the min.
+  return ReduceMin(d, IfThenElse(m, v, MaxOfLanes(d, v)));
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMax(D d, M m, VFromD<D> v) {
+  // Disabled lanes take the overall minimum so they cannot win the max.
+  // NOTE: IfThenElseZero would be wrong when all enabled lanes are negative,
+  // because a disabled zero lane would then become the maximum.
+  return ReduceMax(d, IfThenElse(m, v, MinOfLanes(d, v)));
+}
+
+#endif // HWY_NATIVE_MASKED_REDUCE_SCALAR
+
// ------------------------------ IsEitherNaN
#if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IS_EITHER_NAN
@@ -6444,6 +6466,30 @@ HWY_API V ReverseBits(V v) {
}
#endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64
+// ------------------------------ TableLookupLanesOr
+template <class V, class M>
+HWY_API V TableLookupLanesOr(M m, V a, V b, IndicesFromD<DFromV<V>> idx) {
+  return IfThenElse(m, TableLookupLanes(a, idx), b);
+}
+
+// ------------------------------ TableLookupLanesOrZero
+template <class V, class M>
+HWY_API V TableLookupLanesOrZero(M m, V a, IndicesFromD<DFromV<V>> idx) {
+  return IfThenElseZero(m, TableLookupLanes(a, idx));
+}
+
+// ------------------------------ TwoTablesLookupLanesOr
+template <class D, class M, class V>
+HWY_API V TwoTablesLookupLanesOr(D d, M m, V a, V b, IndicesFromD<D> idx) {
+  return IfThenElse(m, TwoTablesLookupLanes(d, a, b, idx), a);
+}
+
+// ------------------------------ TwoTablesLookupLanesOrZero
+template <class D, class M, class V>
+HWY_API V TwoTablesLookupLanesOrZero(D d, M m, V a, V b, IndicesFromD<D> idx) {
+  return IfThenElse(m, TwoTablesLookupLanes(d, a, b, idx), Zero(d));
+}
+
// ------------------------------ Per4LaneBlockShuffle
#if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
@@ -7299,6 +7345,10 @@ HWY_API V BitShuffle(V v, VI idx) {
#endif // HWY_NATIVE_BITSHUFFLE
+template <class V, class M>
+HWY_API V MaskedOrOrZero(M m, V a, V b) {
+  return IfThenElseZero(m, Or(a, b));
+}
// ================================================== Operator wrapper
// SVE* and RVV currently cannot define operators and have already defined
diff --git a/hwy/tests/logical_test.cc b/hwy/tests/logical_test.cc
index ecd7589c9e..5abc2277bc 100644
--- a/hwy/tests/logical_test.cc
+++ b/hwy/tests/logical_test.cc
@@ -146,6 +146,28 @@ HWY_NOINLINE void TestAllTestBit() {
ForIntegerTypes(ForPartialVectors());
}
+struct TestMaskedOrOrZero {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const MFromD<D> all_true = MaskTrue(d);
+    const auto v1 = Iota(d, 1);
+    const auto v2 = Iota(d, 2);
+
+    HWY_ASSERT_VEC_EQ(d, Or(v2, v1), MaskedOrOrZero(all_true, v1, v2));
+
+    const MFromD<D> first_five = FirstN(d, 5);
+    const Vec<D> v0 = Zero(d);
+
+    const Vec<D> v1_exp = IfThenElse(first_five, Or(v2, v1), v0);
+
+    HWY_ASSERT_VEC_EQ(d, v1_exp, MaskedOrOrZero(first_five, v1, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedLogical() {
+  ForAllTypes(ForPartialVectors<TestMaskedOrOrZero>());
+}
+
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@@ -159,6 +181,7 @@ HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllNot);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogical);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskedLogical);
HWY_AFTER_TEST();
} // namespace
} // namespace hwy
diff --git a/hwy/tests/reduction_test.cc b/hwy/tests/reduction_test.cc
index fffc4a7873..fd35f645f6 100644
--- a/hwy/tests/reduction_test.cc
+++ b/hwy/tests/reduction_test.cc
@@ -352,6 +352,122 @@ HWY_NOINLINE void TestAllSumsOf8() {
ForGEVectors<64, TestSumsOf8>()(uint8_t());
}
+struct TestMaskedReduceSum {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const Vec<D> v2 = Iota(d, 2);
+
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(bool_lanes);
+
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      T expected = 0;
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0);
+        if (bool_lanes[i]) {
+          expected += ConvertScalarTo<T>(i + 2);
+        }
+      }
+
+      const Vec<D> mask_i = Load(d, bool_lanes.get());
+      const Mask<D> mask = RebindMask(d, Gt(mask_i, Zero(d)));
+
+      // If all elements are disabled the result is implementation-defined.
+      if (AllFalse(d, mask)) {
+        continue;
+      }
+
+      HWY_ASSERT_EQ(expected, MaskedReduceSum(d, mask, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedReduceSum() {
+  ForAllTypes(ForPartialVectors<TestMaskedReduceSum>());
+}
+
+struct TestMaskedReduceMin {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const Vec<D> v2 = Iota(d, 2);
+
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(bool_lanes);
+
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      T expected =
+          ConvertScalarTo<T>(N + 3);  // larger than any values in the vector
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0);
+        if (bool_lanes[i]) {
+          if (expected > ConvertScalarTo<T>(i + 2)) {
+            expected = ConvertScalarTo<T>(i + 2);
+          }
+        }
+      }
+
+      const Vec<D> mask_i = Load(d, bool_lanes.get());
+      const Mask<D> mask = RebindMask(d, Gt(mask_i, Zero(d)));
+
+      // If all elements are disabled the result is implementation-defined.
+      if (AllFalse(d, mask)) {
+        continue;
+      }
+
+      HWY_ASSERT_EQ(expected, MaskedReduceMin(d, mask, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedReduceMin() {
+  ForAllTypes(ForPartialVectors<TestMaskedReduceMin>());
+}
+
+struct TestMaskedReduceMax {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const Vec<D> v2 = Iota(d, 2);
+
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(bool_lanes);
+
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      T expected = 0;
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0);
+        if (bool_lanes[i]) {
+          if (expected < ConvertScalarTo<T>(i + 2)) {
+            expected = ConvertScalarTo<T>(i + 2);
+          }
+        }
+      }
+
+      const Vec<D> mask_i = Load(d, bool_lanes.get());
+      const Mask<D> mask = RebindMask(d, Gt(mask_i, Zero(d)));
+
+      // If all elements are disabled the result is implementation-defined.
+      if (AllFalse(d, mask)) {
+        continue;
+      }
+
+      HWY_ASSERT_EQ(expected, MaskedReduceMax(d, mask, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedReduceMax() {
+  ForAllTypes(ForPartialVectors<TestMaskedReduceMax>());
+}
+
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@@ -367,6 +483,10 @@ HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes);
HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf2);
HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf4);
HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8);
+
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMaskedReduceSum);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMaskedReduceMin);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMaskedReduceMax);
HWY_AFTER_TEST();
} // namespace
} // namespace hwy
diff --git a/hwy/tests/table_test.cc b/hwy/tests/table_test.cc
index 09fdd7eaf6..eb5b1a8644 100644
--- a/hwy/tests/table_test.cc
+++ b/hwy/tests/table_test.cc
@@ -103,6 +103,59 @@ HWY_NOINLINE void TestAllTableLookupLanes() {
ForAllTypes(ForPartialVectors());
}
+struct TestTableLookupLanesOr {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+// Fixed: was HWY_SCALARWE (typo), and the #if/#else previously split the
+// template head and left unbalanced braces in the scalar branch.
+#if HWY_TARGET != HWY_SCALAR
+    const RebindToSigned<D> di;
+    using TI = TFromD<decltype(di)>;
+
+    const size_t N = Lanes(d);
+    // Select indices from N-1 counting down.
+    auto indices = IndicesFromVec(
+        d, Sub(Set(di, ConvertScalarTo<TI>(N - 1)), Iota(di, 0)));
+
+    auto expected = AllocateAligned<T>(N);
+    auto expected_zero = AllocateAligned<T>(N);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(expected && expected_zero && bool_lanes);
+
+    const auto v1 = Iota(d, 5);
+    const auto v2 = Iota(d, 8);
+
+    RandomState rng;
+
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0);
+
+        if (bool_lanes[i]) {
+          expected[i] = ConvertScalarTo<T>(N - i + 5 - 1);  // v1[N-1, N-2, ...]
+          expected_zero[i] =
+              ConvertScalarTo<T>(N - i + 5 - 1);  // v1[N-1, N-2, ...]
+        } else {
+          expected[i] = ConvertScalarTo<T>(i + 8);  // v2[i]
+          expected_zero[i] = ConvertScalarTo<T>(0);
+        }
+      }
+
+      const Vec<D> mask_i = Load(d, bool_lanes.get());
+      const Mask<D> mask = RebindMask(d, Gt(mask_i, Zero(d)));
+      HWY_ASSERT_VEC_EQ(d, expected.get(),
+                        TableLookupLanesOr(mask, v1, v2, indices));
+      HWY_ASSERT_VEC_EQ(d, expected_zero.get(),
+                        TableLookupLanesOrZero(mask, v1, indices));
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllTableLookupLanesOr() {
+  ForAllTypes(ForPartialVectors<TestTableLookupLanesOr>());
+}
+
struct TestTwoTablesLookupLanes {
template
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -194,6 +247,64 @@ HWY_NOINLINE void TestAllTwoTablesLookupLanes() {
ForAllTypes(ForPartialVectors());
}
+struct TestTwoTablesLookupLanesOr {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const RebindToSigned<D> di;
+    using TI = TFromD<decltype(di)>;
+
+    const size_t N = Lanes(d);
+    // Select indices from N-1 counting down.
+    auto idx_lower = Sub(Set(di, ConvertScalarTo<TI>(N - 1)), Iota(di, 0));
+    auto idx_upper = Add(idx_lower, Set(di, ConvertScalarTo<TI>(N)));
+    auto indices = IndicesFromVec(d, OddEven(idx_upper, idx_lower));
+
+    auto expected = AllocateAligned<T>(N);
+    auto expected_zero = AllocateAligned<T>(N);
+    auto bool_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(expected && expected_zero && bool_lanes);
+
+    const auto v1 = Iota(d, 5);
+    const auto v2 = Iota(d, 8);
+
+    RandomState rng;
+
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0);
+
+        if (bool_lanes[i]) {
+          if (i % 2) {
+            expected[i] =
+                ConvertScalarTo<T>(N - i + 8 - 1);  // v2[N-1, N-2, ...]
+            expected_zero[i] =
+                ConvertScalarTo<T>(N - i + 8 - 1);  // v2[N-1, N-2, ...]
+          } else {
+            expected[i] =
+                ConvertScalarTo<T>(N - i + 5 - 1);  // v1[N-1, N-2, ...]
+            expected_zero[i] =
+                ConvertScalarTo<T>(N - i + 5 - 1);  // v1[N-1, N-2, ...]
+          }
+        } else {
+          expected[i] = ConvertScalarTo<T>(i + 5);  // v1[i]
+          expected_zero[i] = ConvertScalarTo<T>(0);
+        }
+      }
+
+      const Vec<D> mask_i = Load(d, bool_lanes.get());
+      const Mask<D> mask = RebindMask(d, Gt(mask_i, Zero(d)));
+      HWY_ASSERT_VEC_EQ(d, expected.get(),
+                        TwoTablesLookupLanesOr(d, mask, v1, v2, indices));
+      HWY_ASSERT_VEC_EQ(d, expected_zero.get(),
+                        TwoTablesLookupLanesOrZero(d, mask, v1, v2, indices));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllTwoTablesLookupLanesOr() {
+  ForAllTypes(ForPartialVectors<TestTwoTablesLookupLanesOr>());
+}
+
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@@ -205,7 +316,9 @@ namespace hwy {
namespace {
HWY_BEFORE_TEST(HwyTableTest);
HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTableLookupLanes);
+HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTableLookupLanesOr);
HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTwoTablesLookupLanes);
+HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTwoTablesLookupLanesOr);
HWY_AFTER_TEST();
} // namespace
} // namespace hwy