1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
20 #undef HWY_TARGET_INCLUDE
21 #define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
22 #include "hwy/foreach_target.h" // IWYU pragma: keep
23 #include "hwy/highway.h"
24 #include "hwy/tests/test_util-inl.h"
26 HWY_BEFORE_NAMESPACE();
28 namespace HWY_NAMESPACE {
// Recursively exercises Broadcast<kLane> for every lane index from kLane down
// to 0. For each index it plants a distinct per-block marker in lane kLane of
// every 128-bit block, broadcasts, and checks that the marker fills the block.
// NOTE(review): this listing has gaps (embedded source line numbers skip 34,
// 43, 49-50, 52, 54-57). The `const D d;` declaration and several closing
// braces, plus the `template <class D>` header of the -1 specialization,
// appear to be missing — confirm against the complete file.
30 template <typename D, int kLane>
31 struct TestBroadcastR {
32 HWY_NOINLINE void operator()() const {
33 using T = typename D::T;
// `d` is presumably declared on the dropped line 34 (likely `const D d;`).
35 const size_t N = Lanes(d);
// Partial vectors may have fewer lanes than kLane; nothing to test then.
36 if (kLane >= N) return;
37 auto in_lanes = AllocateAligned<T>(N);
38 std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
// Lanes per 128-bit block (fewer if the whole vector is smaller than 16 B).
39 const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
40 // Need to set within each 128-bit block
41 for (size_t block = 0; block < N; block += blockN) {
// Marker is block+1 so each block broadcasts a distinct nonzero value.
42 in_lanes[block + kLane] = static_cast<T>(block + 1);
44 const auto in = Load(d, in_lanes.get());
45 auto expected = AllocateAligned<T>(N);
46 for (size_t block = 0; block < N; block += blockN) {
47 for (size_t i = 0; i < blockN; ++i) {
// Every lane of a block must equal that block's marker after Broadcast.
48 expected[block + i] = T(block + 1);
51 HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
// Recurse to test the next-lower lane index; terminates at the -1 case.
53 TestBroadcastR<D, kLane - 1>()();
// Recursion terminator: lane index -1 does nothing.
58 struct TestBroadcastR<D, -1> {
59 void operator()() const {}
// Functor entry point: kicks off the TestBroadcastR recursion starting at the
// highest lane index valid within one 128-bit block for this type/descriptor.
// NOTE(review): closing braces (original lines 66-67) are missing from this
// listing — confirm against the complete file.
62 struct TestBroadcast {
63 template <class T, class D>
64 HWY_NOINLINE void operator()(T /*unused*/, D d) {
// Highest testable lane: min of compile-time lane count and 16/sizeof(T)
// (lanes per 128-bit block), minus one.
65 TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
// Runs TestBroadcast across partial-vector configurations.
// NOTE(review): the listing drops the lines that invoke `test` per type and
// the closing brace (original lines 71+) — confirm against the complete file.
69 HWY_NOINLINE void TestAllBroadcast() {
70 const ForPartialVectors<TestBroadcast> test;
// Selects the table descriptor type for TestTableLookupBytes: presumably the
// primary template (non-full) uses the index descriptor DIdx, while the `true`
// specialization uses a full-width ScalableTag<T> table.
// NOTE(review): the primary template's `using type = ...` body and the
// template-parameter header `template <bool kFull>` (dropped lines 77, 80-82,
// 86-87) are missing from this listing — confirm against the complete file.
78 struct ChooseTableSize {
79 template <typename T, typename DIdx>
// Full-size table variant: table vector is a full ScalableTag<T>.
83 struct ChooseTableSize<true> {
84 template <typename T, typename DIdx>
85 using type = ScalableTag<T>;
// Tests TableLookupBytes (and TableLookupBytesOr0) with a random byte table
// and hand-crafted index patterns, then individually verifies zeroing via the
// 0x80 index bit. Skipped entirely on HWY_SCALAR (no byte shuffles there).
// NOTE(review): this listing has gaps — the `template <bool kFull>` struct
// header, the `rng` declaration, and several closing braces / #endif lines
// appear to be dropped (source line numbers skip 88, 93-94, 98, 101, 106,
// 108, 122, 124, 128, 140-141, 143, 149, 153-154, 159-164) — confirm
// against the complete file before editing.
89 struct TestTableLookupBytes {
90 template <class T, class D>
91 HWY_NOINLINE void operator()(T /*unused*/, D d) {
92 #if HWY_TARGET != HWY_SCALAR
// `kFull` presumably comes from the dropped struct template header; the
// table descriptor is either same-sized or full-width (see ChooseTableSize).
95 const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
96 const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
97 const size_t NT8 = Lanes(d_tbl8);
99 const Repartition<uint8_t, D> d8;
100 const size_t N8 = Lanes(d8);
102 // Random input bytes
103 auto in_bytes = AllocateAligned<uint8_t>(NT8);
104 for (size_t i = 0; i < NT8; ++i) {
// `rng` is presumably declared on a dropped line (likely RandomState) — verify.
105 in_bytes[i] = Random32(&rng) & 0xFF;
107 const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));
109 // Enough test data; for larger vectors, upper lanes will be zero.
110 const uint8_t index_bytes_source[64] = {
111 // Same index as source, multiple outputs from same input,
112 // unused input (9), ascending/descending and nonconsecutive neighbors.
113 0, 2, 1, 2, 15, 12, 13, 14, 6, 7, 8, 5, 4, 3, 10, 11,
114 11, 10, 3, 4, 5, 8, 7, 6, 14, 13, 12, 15, 2, 1, 2, 0,
115 4, 3, 2, 2, 5, 6, 7, 7, 15, 15, 15, 15, 15, 15, 0, 1};
116 auto index_bytes = AllocateAligned<uint8_t>(N8);
// Indices must stay within one 16-byte block of the (possibly partial) table.
117 const size_t max_index = HWY_MIN(NT8, 16) - 1;
118 for (size_t i = 0; i < N8; ++i) {
119 index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
120 // Avoid asan error for partial vectors.
121 index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
123 const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
125 const size_t N = Lanes(d);
126 auto expected = AllocateAligned<T>(N);
127 uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
// Compute expected output block by block (TableLookupBytes is per-128-bit-block).
129 for (size_t block = 0; block < N8; block += 16) {
130 for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
131 const uint8_t index = index_bytes[block + i];
132 HWY_ASSERT(index <= max_index);
133 // Note that block + index may exceed NT8 on RVV, which is fine because
134 // the operation uses the larger of the table and index vector size.
135 HWY_ASSERT(block + index < HWY_MAX(N8, NT8));
136 // For large vectors, the lane index may wrap around due to block,
137 // also wrap around after 8-bit overflow.
138 expected_bytes[block + i] =
139 in_bytes[(block + index) % HWY_MIN(NT8, 256)];
142 HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
144 // Individually test zeroing each byte position.
145 for (size_t i = 0; i < N8; ++i) {
146 const uint8_t prev_expected = expected_bytes[i];
147 const uint8_t prev_index = index_bytes[i];
148 expected_bytes[i] = 0;
// Any index with bit 7 set must produce 0; randomize bits 4..6 for coverage.
150 const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
151 HWY_ASSERT(0x80 <= idx && idx < 256);
152 index_bytes[i] = static_cast<uint8_t>(idx);
// NOTE(review): the left-hand side of this Load (presumably
// `const auto indices = ...` on dropped line 154) is missing — verify.
155 Load(d, reinterpret_cast<const T*>(index_bytes.get()));
156 HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
// Restore state so later iterations test exactly one zeroed position.
157 expected_bytes[i] = prev_expected;
158 index_bytes[i] = prev_index;
// Runs TableLookupBytes tests with a table the same size as the index vector.
// NOTE(review): closing brace (original line 169) missing from this listing.
166 HWY_NOINLINE void TestAllTableLookupBytesSame() {
167 // Partial index, same-sized table.
168 ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
// Runs TableLookupBytes tests with a full-size table and partial index vector.
// NOTE(review): closing brace (original line 174) missing from this listing.
171 HWY_NOINLINE void TestAllTableLookupBytesMixed() {
172 // Partial index, full-size table.
173 ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
// Verifies InterleaveLower: interleaves the lower half of each 128-bit block
// of `even` and `odd` into alternating lanes, checked against a scalar model.
// NOTE(review): listing drops closing braces (original lines 187, 196,
// 199-200) — confirm against the complete file.
176 struct TestInterleaveLower {
177 template <class T, class D>
178 HWY_NOINLINE void operator()(T /*unused*/, D d) {
179 using TU = MakeUnsigned<T>;
180 const size_t N = Lanes(d);
181 auto even_lanes = AllocateAligned<T>(N);
182 auto odd_lanes = AllocateAligned<T>(N);
183 auto expected = AllocateAligned<T>(N);
// Even vector holds even values, odd holds odd, so the interleaved result
// is simply ascending integers within each block.
184 for (size_t i = 0; i < N; ++i) {
185 even_lanes[i] = static_cast<T>(2 * i + 0);
186 odd_lanes[i] = static_cast<T>(2 * i + 1);
188 const auto even = Load(d, even_lanes.get());
189 const auto odd = Load(d, odd_lanes.get());
// Lanes per 128-bit block.
191 const size_t blockN = HWY_MIN(16 / sizeof(T), N);
192 for (size_t i = 0; i < Lanes(d); ++i) {
193 const size_t block = i / blockN;
// Lane i of the result comes from position i%blockN of the interleaved
// lower half, offset by the block's origin in the doubled value space.
194 const size_t index = (i % blockN) + block * 2 * blockN;
// Mask to the unsigned range to avoid UB/overflow for narrow signed T.
195 expected[i] = static_cast<T>(index & LimitsMax<TU>());
// Both the one- and two-argument overloads must agree.
197 HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
198 HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
// Verifies InterleaveUpper: like TestInterleaveLower but sourcing the upper
// half of each 128-bit block (hence the extra `+ blockN` in the index model).
// NOTE(review): listing drops closing braces (original lines 213, 221,
// 223-224) — confirm against the complete file.
202 struct TestInterleaveUpper {
203 template <class T, class D>
204 HWY_NOINLINE void operator()(T /*unused*/, D d) {
205 const size_t N = Lanes(d);
// NOTE(review): the dropped line 206 likely contains an early return for
// N == 1 (upper half undefined) — verify.
207 auto even_lanes = AllocateAligned<T>(N);
208 auto odd_lanes = AllocateAligned<T>(N);
209 auto expected = AllocateAligned<T>(N);
210 for (size_t i = 0; i < N; ++i) {
211 even_lanes[i] = static_cast<T>(2 * i + 0);
212 odd_lanes[i] = static_cast<T>(2 * i + 1);
214 const auto even = Load(d, even_lanes.get());
215 const auto odd = Load(d, odd_lanes.get());
// Lanes per 128-bit block.
217 const size_t blockN = HWY_MIN(16 / sizeof(T), N);
218 for (size_t i = 0; i < Lanes(d); ++i) {
219 const size_t block = i / blockN;
// `+ blockN` selects the upper half of the source block.
220 expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
222 HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
// Runs both interleave tests for all types over shrinkable vectors.
// NOTE(review): closing brace (original line 230) missing from this listing.
226 HWY_NOINLINE void TestAllInterleave() {
227 // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
228 ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
229 ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
// Verifies ZipLower: zips lower-half lanes of `even`/`odd` into double-width
// lanes, comparing against a scalar interleave model (or a Set() constant on
// HWY_SCALAR, where only one lane exists).
// NOTE(review): listing drops closing braces and the `#else` between the
// scalar and vector paths (original lines 247, 250, 255, 262, 268-269) —
// confirm against the complete file.
232 struct TestZipLower {
233 template <class T, class D>
234 HWY_NOINLINE void operator()(T /*unused*/, D d) {
235 using WideT = MakeWide<T>;
236 static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
237 static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
238 const size_t N = Lanes(d);
239 auto even_lanes = AllocateAligned<T>(N);
240 auto odd_lanes = AllocateAligned<T>(N);
241 // At least 2 lanes for HWY_SCALAR
242 auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2));
243 const T kMaxT = LimitsMax<T>();
// Mask with kMaxT so values stay representable for narrow T.
244 for (size_t i = 0; i < N; ++i) {
245 even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
246 odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
248 const auto even = Load(d, even_lanes.get());
249 const auto odd = Load(d, odd_lanes.get());
251 const Repartition<WideT, D> dw;
252 #if HWY_TARGET == HWY_SCALAR
253 // Safely handle big-endian
// Lone lane: even=0, odd=1 => wide value is 1 << (bits of T).
254 const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
// NOTE(review): the `#else` (dropped line 255) separating the scalar and
// vector paths is missing from this listing.
256 const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
// Scalar model: pair (even, odd) lanes from the lower half of each block.
257 for (size_t i = 0; i < N; i += 2) {
258 const size_t base = (i / blockN) * blockN;
259 const size_t mod = i % blockN;
260 zip_lanes[i + 0] = even_lanes[mod / 2 + base];
261 zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
// Reinterpret the T pairs as double-width lanes for comparison.
263 const auto expected =
264 Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
265 #endif // HWY_TARGET == HWY_SCALAR
266 HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
267 HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
// Runs ZipLower tests over all type pairs whose zip yields a supported
// double-width integer type; no float pairs (f32+f32 is not f64).
// NOTE(review): the `#endif` lines matching the two HWY_HAVE_INTEGER64
// guards and the closing brace (original lines 277, 284, 287) are missing
// from this listing — confirm against the complete file.
271 HWY_NOINLINE void TestAllZipLower() {
272 const ForDemoteVectors<TestZipLower> lower_unsigned;
273 lower_unsigned(uint8_t());
274 lower_unsigned(uint16_t());
275 #if HWY_HAVE_INTEGER64
276 lower_unsigned(uint32_t()); // generates u64
279 const ForDemoteVectors<TestZipLower> lower_signed;
280 lower_signed(int8_t());
281 lower_signed(int16_t());
282 #if HWY_HAVE_INTEGER64
283 lower_signed(int32_t()); // generates i64
286 // No float - concatenating f32 does not result in a f64
289 // Remove this test (so it does not show as having run) if the only target is
290 // HWY_SCALAR, which does not support this op.
291 #if HWY_TARGETS != HWY_SCALAR
// Verifies ZipUpper: zips upper-half lanes of `even`/`odd` into double-width
// lanes, compared against a scalar model. A no-op on HWY_SCALAR (guarded out).
// NOTE(review): listing drops lines 297-298 (presumably `(void)d;` plus the
// `#else` of the HWY_SCALAR guard), line 311, and closing braces
// (316, 322, 328-329) — confirm against the complete file.
293 struct TestZipUpper {
294 template <class T, class D>
295 HWY_NOINLINE void operator()(T /*unused*/, D d) {
296 #if HWY_TARGET == HWY_SCALAR
299 using WideT = MakeWide<T>;
300 static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
301 static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
302 const size_t N = Lanes(d);
// Need at least one whole 128-bit block for an upper half to exist.
303 if (N < 16 / sizeof(T)) return;
304 auto even_lanes = AllocateAligned<T>(N);
305 auto odd_lanes = AllocateAligned<T>(N);
306 auto zip_lanes = AllocateAligned<T>(N);
307 const T kMaxT = LimitsMax<T>();
// Mask with kMaxT so values stay representable for narrow T.
308 for (size_t i = 0; i < N; ++i) {
309 even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
310 odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
312 const auto even = Load(d, even_lanes.get());
313 const auto odd = Load(d, odd_lanes.get());
// Lanes per 128-bit block.
315 const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
// `+ blockN / 2` selects the upper half of each source block.
317 for (size_t i = 0; i < N; i += 2) {
318 const size_t base = (i / blockN) * blockN + blockN / 2;
319 const size_t mod = i % blockN;
320 zip_lanes[i + 0] = even_lanes[mod / 2 + base];
321 zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
323 const Repartition<WideT, D> dw;
324 const auto expected =
325 Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
326 HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
327 #endif // HWY_TARGET == HWY_SCALAR
// Runs ZipUpper tests over all type pairs whose zip yields a supported
// double-width integer type; no float pairs (f32+f32 is not f64).
// NOTE(review): the `#endif` lines matching the two HWY_HAVE_INTEGER64
// guards and the closing brace (original lines 337, 344, 347) are missing
// from this listing — confirm against the complete file.
331 HWY_NOINLINE void TestAllZipUpper() {
332 const ForShrinkableVectors<TestZipUpper> upper_unsigned;
333 upper_unsigned(uint8_t());
334 upper_unsigned(uint16_t());
335 #if HWY_HAVE_INTEGER64
336 upper_unsigned(uint32_t()); // generates u64
339 const ForShrinkableVectors<TestZipUpper> upper_signed;
340 upper_signed(int8_t());
341 upper_signed(int16_t());
342 #if HWY_HAVE_INTEGER64
343 upper_signed(int32_t()); // generates i64
346 // No float - concatenating f32 does not result in a f64
349 #endif // HWY_TARGETS != HWY_SCALAR
// Verifies the fixed 32-bit shuffles (Shuffle2301/1032/0321/2103/0123) by
// comparing each against an explicitly built per-block expectation.
// NOTE(review): this listing drops the `public:`/`private:` access
// specifiers, the `using T = TFromD<D>;` / `const int line` parameter, and
// several closing braces (original lines 352, 361-363, 370-371, 374,
// 381, 383-384) — confirm against the complete file.
351 class TestSpecialShuffle32 {
353 template <class T, class D>
354 HWY_NOINLINE void operator()(T /*unused*/, D d) {
355 const auto v = Iota(d, 0);
// Each VerifyLanes32 call lists the source lane index for result lanes
// 3, 2, 1, 0 of every 128-bit block.
356 VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
357 VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
358 VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
359 VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
360 VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
364 // HWY_INLINE works around a Clang SVE compiler bug where all but the first
365 // 128 bits (the NEON register) of actual are zero.
366 template <class D, class V>
367 HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
368 const size_t i2, const size_t i1,
369 const size_t i0, const char* filename,
// NOTE(review): the trailing `const int line)` parameter and `using T = ...`
// (dropped lines 370-371) are missing from this listing.
372 constexpr size_t kBlockN = 16 / sizeof(T);
373 const size_t N = Lanes(d);
375 auto expected = AllocateAligned<T>(N);
// Build the expectation blockwise: lane k of each block holds block + ik,
// matching the Iota(d, 0) input shuffled per the given indices.
376 for (size_t block = 0; block < N; block += kBlockN) {
377 expected[block + 3] = static_cast<T>(block + i3);
378 expected[block + 2] = static_cast<T>(block + i2);
379 expected[block + 1] = static_cast<T>(block + i1);
380 expected[block + 0] = static_cast<T>(block + i0);
382 AssertVecEqual(d, expected.get(), actual, filename, line);
// Verifies the fixed 64-bit shuffle Shuffle01 (swap the two 64-bit lanes of
// each 128-bit block) against an explicitly built expectation.
// NOTE(review): this listing drops the `public:`/`private:` access
// specifiers, the trailing parameter list / `using T = ...` lines, and
// closing braces (original lines 387, 392-394, 400-401, 404, 409,
// 411-412) — confirm against the complete file.
386 class TestSpecialShuffle64 {
388 template <class T, class D>
389 HWY_NOINLINE void operator()(T /*unused*/, D d) {
390 const auto v = Iota(d, 0);
// Result lane 1 takes source lane 0, lane 0 takes source lane 1.
391 VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
395 // HWY_INLINE works around a Clang SVE compiler bug where all but the first
396 // 128 bits (the NEON register) of actual are zero.
397 template <class D, class V>
398 HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
399 const size_t i0, const char* filename,
// NOTE(review): the trailing `const int line)` parameter and `using T = ...`
// (dropped lines 400-401) are missing from this listing.
402 constexpr size_t kBlockN = 16 / sizeof(T);
403 const size_t N = Lanes(d);
405 auto expected = AllocateAligned<T>(N);
// Lane k of each block holds block + ik, matching shuffled Iota(d, 0).
406 for (size_t block = 0; block < N; block += kBlockN) {
407 expected[block + 1] = static_cast<T>(block + i1);
408 expected[block + 0] = static_cast<T>(block + i0);
410 AssertVecEqual(d, expected.get(), actual, filename, line);
// Runs the 32-bit and (when available) 64-bit special-shuffle tests on
// vectors of at least 128 bits.
// NOTE(review): the lines invoking test32/test64/test_d per lane type, the
// `#endif`, and the closing brace (original lines 416-419, 422-426,
// 428-430) are missing from this listing — confirm against the complete file.
414 HWY_NOINLINE void TestAllSpecialShuffles() {
415 const ForGEVectors<128, TestSpecialShuffle32> test32;
420 #if HWY_HAVE_INTEGER64
421 const ForGEVectors<128, TestSpecialShuffle64> test64;
427 const ForGEVectors<128, TestSpecialShuffle64> test_d;
432 // NOLINTNEXTLINE(google-readability-namespace-comments)
433 } // namespace HWY_NAMESPACE
435 HWY_AFTER_NAMESPACE();
// Registers each TestAll* entry point with the gtest-based per-target runner.
// NOTE(review): the `#endif` matching the HWY_TARGETS != HWY_SCALAR guard
// (original line 448) and any trailing HWY_AFTER_TEST lines are missing from
// this listing — confirm against the complete file.
440 HWY_BEFORE_TEST(HwyBlockwiseTest);
441 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
442 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
443 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
444 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
445 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
446 #if HWY_TARGETS != HWY_SCALAR
447 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
449 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);