const simd_dw<vectorNum> &offset,
char *base_address) {
for (uint32_t i = 0; i < vectorNum; ++i) {
- const int v0 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 0);
- const int v1 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 1);
- const int v2 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 2);
- const int v3 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 3);
+ const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);
+ const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);
+ const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);
+ const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);
const int o0 = _mm_extract_epi32(offset.m[i], 0);
const int o1 = _mm_extract_epi32(offset.m[i], 1);
const int o2 = _mm_extract_epi32(offset.m[i], 2);
*(int*)(base_address + o3) = v3;
}
}
-
+/* Scatter with a scalar value and vector offsets: value is converted to a
+ * simd_dw<vectorNum> and SCATTER is called again with two vector operands.
+ * NOTE(review): the conversion presumably replicates the scalar in every
+ * lane - confirm against the simd_dw constructor */
+template <uint32_t vectorNum>
+INLINE void SCATTER(const scalar_dw &value,
+                    const simd_dw<vectorNum> &offset,
+                    char *base_address)
+{
+  const simd_dw<vectorNum> vectorValue(value);
+  SCATTER(vectorValue, offset, base_address);
+}
+/* Scatter with vector values and a scalar offset: offset is converted to a
+ * simd_dw<vectorNum> and SCATTER is called again with two vector operands.
+ * NOTE(review): if the conversion replicates the scalar, every lane is
+ * written to the same address - confirm this is the intended behavior */
+template <uint32_t vectorNum>
+INLINE void SCATTER(const simd_dw<vectorNum> &value,
+                    const scalar_dw &offset,
+                    char *base_address)
+{
+  const simd_dw<vectorNum> vectorOffset(offset);
+  SCATTER(value, vectorOffset, base_address);
+}
+#include <cstdio>
/* Gather
 *
 * For each of the vectorNum registers of offset, extracts its four 32-bit
 * lanes, reads one int at base_address + lane (base_address is a char
 * pointer, so the lanes are byte offsets) and stores the four values in the
 * matching lanes of dst.m[i]. The previous content of dst is never read.
 */
template <uint32_t vectorNum>
INLINE void GATHER(simd_dw<vectorNum> &dst,
const simd_dw<vectorNum> &offset,
- char *base_address) {
+ const char *base_address) {
for (uint32_t i = 0; i < vectorNum; ++i) {
const int o0 = _mm_extract_epi32(offset.m[i], 0);
const int o1 = _mm_extract_epi32(offset.m[i], 1);
const int o2 = _mm_extract_epi32(offset.m[i], 2);
const int o3 = _mm_extract_epi32(offset.m[i], 3);
- const int v0 = *(int*)(base_address + o0);
- const int v1 = *(int*)(base_address + o1);
- const int v2 = *(int*)(base_address + o2);
- const int v3 = *(int*)(base_address + o3);
- _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v0, 0);
- _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v1, 1);
- _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v2, 2);
- _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v3, 3);
+ const int v0 = *(const int*)(base_address + o0);
+ const int v1 = *(const int*)(base_address + o1);
+ const int v2 = *(const int*)(base_address + o2);
+ const int v3 = *(const int*)(base_address + o3);
+ // The removed lines dropped the value returned by _mm_insert_epi32, so dst
+ // was never updated. All four lanes are now written at once (_mm_set_epi32
+ // takes the highest lane first), which also avoids reading the stale and
+ // possibly uninitialized content of dst
+ dst.m[i] = SI2PS(_mm_set_epi32(v3, v2, v1, v0));
}
}
+/* Gather with a scalar offset: offset is converted to a simd_dw<vectorNum>
+ * and GATHER is called again with a vector offset.
+ * NOTE(review): the conversion presumably replicates the scalar in every
+ * lane, which would load the same location in all lanes - confirm */
+template <uint32_t vectorNum>
+INLINE void GATHER(simd_dw<vectorNum> &dst,
+                   const scalar_dw &offset,
+                   const char *base_address)
+{
+  const simd_dw<vectorNum> vectorOffset(offset);
+  GATHER(dst, vectorOffset, base_address);
+}
//////////////////////////////////////////////////////////////////////////////
// Scalar instructions
// Scalar store: writes src.u as a uint32_t at the address ptr
INLINE void STORE(scalar_dw src, char *ptr) { *(uint32_t *) ptr = src.u; }
// Load immediate: sets dst.u to the value u
INLINE void LOADI(scalar_dw &dst, uint32_t u) { dst.u = u; }
// Scalar scatter: writes value.u as a uint32_t at base + offset.u. base is a
// char pointer, so offset.u is counted in bytes
INLINE void SCATTER(scalar_dw value, scalar_dw offset, char *base) { *(uint32_t*)(base + offset.u) = value.u; }
// Scalar gather: reads a uint32_t at base + offset.u (a byte offset, base
// being a char pointer) into dst.u. The added line only const-qualifies base
// and the pointer the value is read through
-INLINE void GATHER(scalar_dw &dst, scalar_dw offset, char *base) { dst.u = *(uint32_t*)(base + offset.u); }
+INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = *(const uint32_t*)(base + offset.u); }
//////////////////////////////////////////////////////////////////////////////
// Identical instructions are forwarded
" const simd_dw<vectorNum> &offset,\n"
" char *base_address) {\n"
" for (uint32_t i = 0; i < vectorNum; ++i) {\n"
-" const int v0 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 0);\n"
-" const int v1 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 1);\n"
-" const int v2 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 2);\n"
-" const int v3 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 3);\n"
+" const int v0 = _mm_extract_epi32(PS2SI(value.m[i]), 0);\n"
+" const int v1 = _mm_extract_epi32(PS2SI(value.m[i]), 1);\n"
+" const int v2 = _mm_extract_epi32(PS2SI(value.m[i]), 2);\n"
+" const int v3 = _mm_extract_epi32(PS2SI(value.m[i]), 3);\n"
" const int o0 = _mm_extract_epi32(offset.m[i], 0);\n"
" const int o1 = _mm_extract_epi32(offset.m[i], 1);\n"
" const int o2 = _mm_extract_epi32(offset.m[i], 2);\n"
" *(int*)(base_address + o3) = v3;\n"
" }\n"
"}\n"
-"\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void SCATTER(const scalar_dw &value,\n"
+" const simd_dw<vectorNum> &offset,\n"
+" char *base_address) {\n"
+" SCATTER(simd_dw<vectorNum>(value), offset, base_address);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void SCATTER(const simd_dw<vectorNum> &value,\n"
+" const scalar_dw &offset,\n"
+" char *base_address) {\n"
+" SCATTER(value, simd_dw<vectorNum>(offset), base_address);\n"
+"}\n"
+"#include <cstdio>\n"
"/* Gather */\n"
"template <uint32_t vectorNum>\n"
"INLINE void GATHER(simd_dw<vectorNum> &dst,\n"
" const simd_dw<vectorNum> &offset,\n"
-" char *base_address) {\n"
+" const char *base_address) {\n"
" for (uint32_t i = 0; i < vectorNum; ++i) {\n"
" const int o0 = _mm_extract_epi32(offset.m[i], 0);\n"
" const int o1 = _mm_extract_epi32(offset.m[i], 1);\n"
" const int o2 = _mm_extract_epi32(offset.m[i], 2);\n"
" const int o3 = _mm_extract_epi32(offset.m[i], 3);\n"
-" const int v0 = *(int*)(base_address + o0);\n"
-" const int v1 = *(int*)(base_address + o1);\n"
-" const int v2 = *(int*)(base_address + o2);\n"
-" const int v3 = *(int*)(base_address + o3);\n"
-" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v0, 0);\n"
-" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v1, 1);\n"
-" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v2, 2);\n"
-" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v3, 3);\n"
+" const int v0 = *(const int*)(base_address + o0);\n"
+" const int v1 = *(const int*)(base_address + o1);\n"
+" const int v2 = *(const int*)(base_address + o2);\n"
+" const int v3 = *(const int*)(base_address + o3);\n"
+" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v0, 0));\n"
+" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v1, 1));\n"
+" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v2, 2));\n"
+" dst.m[i] = SI2PS(_mm_insert_epi32(PS2SI(dst.m[i]), v3, 3));\n"
" }\n"
"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void GATHER(simd_dw<vectorNum> &dst,\n"
+" const scalar_dw &offset,\n"
+" const char *base_address) {\n"
+" GATHER(dst, simd_dw<vectorNum>(offset), base_address);\n"
+"}\n"
"\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"// Scalar instructions\n"
"INLINE void STORE(scalar_dw src, char *ptr) { *(uint32_t *) ptr = src.u; }\n"
"INLINE void LOADI(scalar_dw &dst, uint32_t u) { dst.u = u; }\n"
"INLINE void SCATTER(scalar_dw value, scalar_dw offset, char *base) { *(uint32_t*)(base + offset.u) = value.u; }\n"
-"INLINE void GATHER(scalar_dw &dst, scalar_dw offset, char *base) { dst.u = *(uint32_t*)(base + offset.u); }\n"
+"INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = *(const uint32_t*)(base + offset.u); }\n"
"\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"// Identical instructions are forwarded\n"
#include "backend/sim/sim_vector.h"
#include "utest/utest.hpp"
-
+#include <algorithm>
// Approximate equality of two floats: |x-y| divided by (1 + max(|x|,|y|)) must
// be strictly below 1e-6. The "1 +" term keeps the divisor at least 1, so the
// test is close to an absolute one when both values are small
static INLINE bool ok(float x, float y) {
return fabs(x-y) / (1.f + std::max(fabs(x), fabs(y))) < 1.e-6;
}
#undef CHECK_CMP_OP
+/* Round trip test for GATHER and SCATTER: values are gathered from data
+ * through a shuffled list of byte offsets, scattered into dst through a
+ * second shuffled list, then the source and destination slots of every
+ * element are compared */
+static void utestScatterGather(void)
+{
+  const uint32_t elemCount = 64;
+  uint32_t data[elemCount], gatherOffsets[elemCount];
+  uint32_t scatterOffsets[elemCount], dst[elemCount];
+  simd1dw value0, scatter0, gather0, value4, scatter4, gather4;
+  simd16dw value1, scatter1, gather1, value2, scatter2, gather2;
+  simd8dw value6, scatter6, gather6, value7, scatter7, gather7;
+
+  // data[n] holds n and both offset lists start as 0, 4, 8, ... (in bytes)
+  for (uint32_t n = 0; n < elemCount; ++n) {
+    const uint32_t byteOffset = uint32_t(n * sizeof(uint32_t));
+    data[n] = n;
+    gatherOffsets[n] = byteOffset;
+    scatterOffsets[n] = byteOffset;
+  }
+
+  // Shuffle both lists: entry n is swapped with a random later entry. Only
+  // swaps are done, so no offset is duplicated and two lanes of one scatter
+  // never target the same slot of dst
+  for (uint32_t n = 0; n < elemCount - 1; ++n) {
+    const int gatherIndex = rand() % (elemCount - 1 - n) + n + 1;
+    const int scatterIndex = rand() % (elemCount - 1 - n) + n + 1;
+    std::swap(gatherOffsets[n], gatherOffsets[gatherIndex]);
+    std::swap(scatterOffsets[n], scatterOffsets[scatterIndex]);
+  }
+
+  // Loads the offsets found at index##INDEX, runs one gather followed by one
+  // scatter and checks every element handled by the register
+#define CHECK_SCATTER_GATHER_OP(INDEX)\
+  do {\
+    LOAD(gather##INDEX, (const char *) (gatherOffsets+index##INDEX));\
+    LOAD(scatter##INDEX, (const char *) (scatterOffsets+index##INDEX));\
+    GATHER(value##INDEX, gather##INDEX, (const char *) data);\
+    SCATTER(value##INDEX, scatter##INDEX, (char *) dst);\
+    for (uint32_t i = 0; i < elemNum(value##INDEX); ++i) {\
+      GBE_ASSERT(data[gatherOffsets[index##INDEX+i] / sizeof(uint32_t)] ==\
+                 dst[scatterOffsets[index##INDEX+i] / sizeof(uint32_t)]);\
+    }\
+  } while (0)
+
+  // NOTE(review): the start indices look chosen so that index + width stays
+  // inside the 64 entries, assuming simdNdw handles N elements - confirm
+  for (uint32_t iteration = 0; iteration < 32; ++iteration) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    const int index6 = rand() % 16;
+    const int index7 = rand() % 32;
+    CHECK_SCATTER_GATHER_OP(0);
+    CHECK_SCATTER_GATHER_OP(1);
+    CHECK_SCATTER_GATHER_OP(2);
+    CHECK_SCATTER_GATHER_OP(4);
+    CHECK_SCATTER_GATHER_OP(6);
+    CHECK_SCATTER_GATHER_OP(7);
+  }
+#undef CHECK_SCATTER_GATHER_OP
+}
+
+
// Entry point of the vector tests: calls each test function of this file in
// turn through UTEST_EXPECT_SUCCESS.
// NOTE(review): UTEST_EXPECT_SUCCESS and UTEST_REGISTER come from
// utest/utest.hpp - presumably they report failures and register the test
// with the framework; confirm there
static void utestVector(void)
{
UTEST_EXPECT_SUCCESS(utestFP());
UTEST_EXPECT_SUCCESS(utestFPCmp());
UTEST_EXPECT_SUCCESS(utestINT32Cmp());
UTEST_EXPECT_SUCCESS(utestUINT32Cmp());
+ UTEST_EXPECT_SUCCESS(utestScatterGather()); // gather / scatter round trip
}
UTEST_REGISTER(utestVector)