third_party/highway/hwy/tests/blockwise_test.cc

   1 // Copyright 2019 Google LLC
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15
  16 #include <stddef.h>
  17 #include <stdint.h>
  18 #include <string.h>
  19
  20 #undef HWY_TARGET_INCLUDE
  21 #define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
  22 #include "hwy/foreach_target.h"  // IWYU pragma: keep
  23 #include "hwy/highway.h"
  24 #include "hwy/tests/test_util-inl.h"
  25
  26 HWY_BEFORE_NAMESPACE();
  27 namespace hwy {
  28 namespace HWY_NAMESPACE {
  29
  30 template <typename D, int kLane>
  31 struct TestBroadcastR {
  32   HWY_NOINLINE void operator()() const {
  33     using T = typename D::T;
  34     const D d;
  35     const size_t N = Lanes(d);
  36     if (kLane >= N) return;
  37     auto in_lanes = AllocateAligned<T>(N);
  38     std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
  39     const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
  40     // Need to set within each 128-bit block
  41     for (size_t block = 0; block < N; block += blockN) {
  42       in_lanes[block + kLane] = static_cast<T>(block + 1);
  43     }
  44     const auto in = Load(d, in_lanes.get());
  45     auto expected = AllocateAligned<T>(N);
  46     for (size_t block = 0; block < N; block += blockN) {
  47       for (size_t i = 0; i < blockN; ++i) {
  48         expected[block + i] = T(block + 1);
  49       }
  50     }
  51     HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
  52
  53     TestBroadcastR<D, kLane - 1>()();
  54   }
  55 };
  56
  57 template <class D>
  58 struct TestBroadcastR<D, -1> {
  59   void operator()() const {}
  60 };
  61
  62 struct TestBroadcast {
  63   template <class T, class D>
  64   HWY_NOINLINE void operator()(T /*unused*/, D d) {
  65     TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
  66   }
  67 };
  68
  69 HWY_NOINLINE void TestAllBroadcast() {
  70   const ForPartialVectors<TestBroadcast> test;
  71   // No u/i8.
  72   test(uint16_t());
  73   test(int16_t());
  74   ForUIF3264(test);
  75 }
  76
  77 template <bool kFull>
  78 struct ChooseTableSize {
  79   template <typename T, typename DIdx>
  80   using type = DIdx;
  81 };
  82 template <>
  83 struct ChooseTableSize<true> {
  84   template <typename T, typename DIdx>
  85   using type = ScalableTag<T>;
  86 };
  87
  88 template <bool kFull>
  89 struct TestTableLookupBytes {
  90   template <class T, class D>
  91   HWY_NOINLINE void operator()(T /*unused*/, D d) {
  92 #if HWY_TARGET != HWY_SCALAR
  93     RandomState rng;
  94
  95     const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
  96     const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
  97     const size_t NT8 = Lanes(d_tbl8);
  98
  99     const Repartition<uint8_t, D> d8;
 100     const size_t N8 = Lanes(d8);
 101
 102     // Random input bytes
 103     auto in_bytes = AllocateAligned<uint8_t>(NT8);
 104     for (size_t i = 0; i < NT8; ++i) {
 105       in_bytes[i] = Random32(&rng) & 0xFF;
 106     }
 107     const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));
 108
 109     // Enough test data; for larger vectors, upper lanes will be zero.
 110     const uint8_t index_bytes_source[64] = {
 111         // Same index as source, multiple outputs from same input,
 112         // unused input (9), ascending/descending and nonconsecutive neighbors.
 113         0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
 114         11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
 115         4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
 116     auto index_bytes = AllocateAligned<uint8_t>(N8);
 117     const size_t max_index = HWY_MIN(NT8, 16) - 1;
 118     for (size_t i = 0; i < N8; ++i) {
 119       index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
 120       // Avoid asan error for partial vectors.
 121       index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
 122     }
 123     const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
 124
 125     const size_t N = Lanes(d);
 126     auto expected = AllocateAligned<T>(N);
 127     uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
 128
 129     for (size_t block = 0; block < N8; block += 16) {
 130       for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
 131         const uint8_t index = index_bytes[block + i];
 132         HWY_ASSERT(index <= max_index);
 133         // Note that block + index may exceed NT8 on RVV, which is fine because
 134         // the operation uses the larger of the table and index vector size.
 135         HWY_ASSERT(block + index < HWY_MAX(N8, NT8));
 136         // For large vectors, the lane index may wrap around due to block,
 137         // also wrap around after 8-bit overflow.
 138         expected_bytes[block + i] =
 139             in_bytes[(block + index) % HWY_MIN(NT8, 256)];
 140       }
 141     }
 142     HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
 143
 144     // Individually test zeroing each byte position.
 145     for (size_t i = 0; i < N8; ++i) {
 146       const uint8_t prev_expected = expected_bytes[i];
 147       const uint8_t prev_index = index_bytes[i];
 148       expected_bytes[i] = 0;
 149
 150       const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
 151       HWY_ASSERT(0x80 <= idx && idx < 256);
 152       index_bytes[i] = static_cast<uint8_t>(idx);
 153
 154       const auto indices =
 155           Load(d, reinterpret_cast<const T*>(index_bytes.get()));
 156       HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
 157       expected_bytes[i] = prev_expected;
 158       index_bytes[i] = prev_index;
 159     }
 160 #else
 161     (void)d;
 162 #endif
 163   }
 164 };
 165
 166 HWY_NOINLINE void TestAllTableLookupBytesSame() {
 167   // Partial index, same-sized table.
 168   ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
 169 }
 170
 171 HWY_NOINLINE void TestAllTableLookupBytesMixed() {
 172   // Partial index, full-size table.
 173   ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
 174 }
 175
 176 struct TestInterleaveLower {
 177   template <class T, class D>
 178   HWY_NOINLINE void operator()(T /*unused*/, D d) {
 179     using TU = MakeUnsigned<T>;
 180     const size_t N = Lanes(d);
 181     auto even_lanes = AllocateAligned<T>(N);
 182     auto odd_lanes = AllocateAligned<T>(N);
 183     auto expected = AllocateAligned<T>(N);
 184     for (size_t i = 0; i < N; ++i) {
 185       even_lanes[i] = static_cast<T>(2 * i + 0);
 186       odd_lanes[i] = static_cast<T>(2 * i + 1);
 187     }
 188     const auto even = Load(d, even_lanes.get());
 189     const auto odd = Load(d, odd_lanes.get());
 190
 191     const size_t blockN = HWY_MIN(16 / sizeof(T), N);
 192     for (size_t i = 0; i < Lanes(d); ++i) {
 193       const size_t block = i / blockN;
 194       const size_t index = (i % blockN) + block * 2 * blockN;
 195       expected[i] = static_cast<T>(index & LimitsMax<TU>());
 196     }
 197     HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
 198     HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
 199   }
 200 };
 201
 202 struct TestInterleaveUpper {
 203   template <class T, class D>
 204   HWY_NOINLINE void operator()(T /*unused*/, D d) {
 205     const size_t N = Lanes(d);
 206     if (N == 1) return;
 207     auto even_lanes = AllocateAligned<T>(N);
 208     auto odd_lanes = AllocateAligned<T>(N);
 209     auto expected = AllocateAligned<T>(N);
 210     for (size_t i = 0; i < N; ++i) {
 211       even_lanes[i] = static_cast<T>(2 * i + 0);
 212       odd_lanes[i] = static_cast<T>(2 * i + 1);
 213     }
 214     const auto even = Load(d, even_lanes.get());
 215     const auto odd = Load(d, odd_lanes.get());
 216
 217     const size_t blockN = HWY_MIN(16 / sizeof(T), N);
 218     for (size_t i = 0; i < Lanes(d); ++i) {
 219       const size_t block = i / blockN;
 220       expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
 221     }
 222     HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
 223   }
 224 };
 225
 226 HWY_NOINLINE void TestAllInterleave() {
 227   // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
 228   ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
 229   ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
 230 }
 231
 232 struct TestZipLower {
 233   template <class T, class D>
 234   HWY_NOINLINE void operator()(T /*unused*/, D d) {
 235     using WideT = MakeWide<T>;
 236     static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
 237     static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
 238     const size_t N = Lanes(d);
 239     auto even_lanes = AllocateAligned<T>(N);
 240     auto odd_lanes = AllocateAligned<T>(N);
 241     // At least 2 lanes for HWY_SCALAR
 242     auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2));
 243     const T kMaxT = LimitsMax<T>();
 244     for (size_t i = 0; i < N; ++i) {
 245       even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
 246       odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
 247     }
 248     const auto even = Load(d, even_lanes.get());
 249     const auto odd = Load(d, odd_lanes.get());
 250
 251     const Repartition<WideT, D> dw;
 252 #if HWY_TARGET == HWY_SCALAR
 253     // Safely handle big-endian
 254     const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
 255 #else
 256     const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
 257     for (size_t i = 0; i < N; i += 2) {
 258       const size_t base = (i / blockN) * blockN;
 259       const size_t mod = i % blockN;
 260       zip_lanes[i + 0] = even_lanes[mod / 2 + base];
 261       zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
 262     }
 263     const auto expected =
 264         Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
 265 #endif  // HWY_TARGET == HWY_SCALAR
 266     HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
 267     HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
 268   }
 269 };
 270
 271 HWY_NOINLINE void TestAllZipLower() {
 272   const ForDemoteVectors<TestZipLower> lower_unsigned;
 273   lower_unsigned(uint8_t());
 274   lower_unsigned(uint16_t());
 275 #if HWY_HAVE_INTEGER64
 276   lower_unsigned(uint32_t());  // generates u64
 277 #endif
 278
 279   const ForDemoteVectors<TestZipLower> lower_signed;
 280   lower_signed(int8_t());
 281   lower_signed(int16_t());
 282 #if HWY_HAVE_INTEGER64
 283   lower_signed(int32_t());  // generates i64
 284 #endif
 285
 286   // No float - concatenating f32 does not result in a f64
 287 }
 288
 289 // Remove this test (so it does not show as having run) if the only target is
 290 // HWY_SCALAR, which does not support this op.
 291 #if HWY_TARGETS != HWY_SCALAR
 292
 293 struct TestZipUpper {
 294   template <class T, class D>
 295   HWY_NOINLINE void operator()(T /*unused*/, D d) {
 296 #if HWY_TARGET == HWY_SCALAR
 297     (void)d;
 298 #else
 299     using WideT = MakeWide<T>;
 300     static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
 301     static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
 302     const size_t N = Lanes(d);
 303     if (N < 16 / sizeof(T)) return;
 304     auto even_lanes = AllocateAligned<T>(N);
 305     auto odd_lanes = AllocateAligned<T>(N);
 306     auto zip_lanes = AllocateAligned<T>(N);
 307     const T kMaxT = LimitsMax<T>();
 308     for (size_t i = 0; i < N; ++i) {
 309       even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
 310       odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
 311     }
 312     const auto even = Load(d, even_lanes.get());
 313     const auto odd = Load(d, odd_lanes.get());
 314
 315     const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
 316
 317     for (size_t i = 0; i < N; i += 2) {
 318       const size_t base = (i / blockN) * blockN + blockN / 2;
 319       const size_t mod = i % blockN;
 320       zip_lanes[i + 0] = even_lanes[mod / 2 + base];
 321       zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
 322     }
 323     const Repartition<WideT, D> dw;
 324     const auto expected =
 325         Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
 326     HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
 327 #endif  // HWY_TARGET == HWY_SCALAR
 328   }
 329 };
 330
 331 HWY_NOINLINE void TestAllZipUpper() {
 332   const ForShrinkableVectors<TestZipUpper> upper_unsigned;
 333   upper_unsigned(uint8_t());
 334   upper_unsigned(uint16_t());
 335 #if HWY_HAVE_INTEGER64
 336   upper_unsigned(uint32_t());  // generates u64
 337 #endif
 338
 339   const ForShrinkableVectors<TestZipUpper> upper_signed;
 340   upper_signed(int8_t());
 341   upper_signed(int16_t());
 342 #if HWY_HAVE_INTEGER64
 343   upper_signed(int32_t());  // generates i64
 344 #endif
 345
 346   // No float - concatenating f32 does not result in a f64
 347 }
 348
 349 #endif  // HWY_TARGETS != HWY_SCALAR
 350
 351 class TestSpecialShuffle32 {
 352  public:
 353   template <class T, class D>
 354   HWY_NOINLINE void operator()(T /*unused*/, D d) {
 355     const auto v = Iota(d, 0);
 356     VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
 357     VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
 358     VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
 359     VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
 360     VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
 361   }
 362
 363  private:
 364   // HWY_INLINE works around a Clang SVE compiler bug where all but the first
 365   // 128 bits (the NEON register) of actual are zero.
 366   template <class D, class V>
 367   HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
 368                                 const size_t i2, const size_t i1,
 369                                 const size_t i0, const char* filename,
 370                                 const int line) {
 371     using T = TFromD<D>;
 372     constexpr size_t kBlockN = 16 / sizeof(T);
 373     const size_t N = Lanes(d);
 374     if (N < 4) return;
 375     auto expected = AllocateAligned<T>(N);
 376     for (size_t block = 0; block < N; block += kBlockN) {
 377       expected[block + 3] = static_cast<T>(block + i3);
 378       expected[block + 2] = static_cast<T>(block + i2);
 379       expected[block + 1] = static_cast<T>(block + i1);
 380       expected[block + 0] = static_cast<T>(block + i0);
 381     }
 382     AssertVecEqual(d, expected.get(), actual, filename, line);
 383   }
 384 };
 385
 386 class TestSpecialShuffle64 {
 387  public:
 388   template <class T, class D>
 389   HWY_NOINLINE void operator()(T /*unused*/, D d) {
 390     const auto v = Iota(d, 0);
 391     VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
 392   }
 393
 394  private:
 395   // HWY_INLINE works around a Clang SVE compiler bug where all but the first
 396   // 128 bits (the NEON register) of actual are zero.
 397   template <class D, class V>
 398   HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
 399                                 const size_t i0, const char* filename,
 400                                 const int line) {
 401     using T = TFromD<D>;
 402     constexpr size_t kBlockN = 16 / sizeof(T);
 403     const size_t N = Lanes(d);
 404     if (N < 2) return;
 405     auto expected = AllocateAligned<T>(N);
 406     for (size_t block = 0; block < N; block += kBlockN) {
 407       expected[block + 1] = static_cast<T>(block + i1);
 408       expected[block + 0] = static_cast<T>(block + i0);
 409     }
 410     AssertVecEqual(d, expected.get(), actual, filename, line);
 411   }
 412 };
 413
 414 HWY_NOINLINE void TestAllSpecialShuffles() {
 415   const ForGEVectors<128, TestSpecialShuffle32> test32;
 416   test32(uint32_t());
 417   test32(int32_t());
 418   test32(float());
 419
 420 #if HWY_HAVE_INTEGER64
 421   const ForGEVectors<128, TestSpecialShuffle64> test64;
 422   test64(uint64_t());
 423   test64(int64_t());
 424 #endif
 425
 426 #if HWY_HAVE_FLOAT64
 427   const ForGEVectors<128, TestSpecialShuffle64> test_d;
 428   test_d(double());
 429 #endif
 430 }
 431
 432 // NOLINTNEXTLINE(google-readability-namespace-comments)
 433 }  // namespace HWY_NAMESPACE
 434 }  // namespace hwy
 435 HWY_AFTER_NAMESPACE();
 436
 437 #if HWY_ONCE
 438
 439 namespace hwy {
 440 HWY_BEFORE_TEST(HwyBlockwiseTest);
 441 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
 442 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
 443 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
 444 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
 445 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
 446 #if HWY_TARGETS != HWY_SCALAR
 447 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
 448 #endif
 449 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
 450 }  // namespace hwy
 451
 452 #endif