const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
kernel->curbeSize = 0u;
+ // We insert the block IP mask first
+ kernel->patches.push_back(PatchInfo(GBE_CURBE_BLOCK_IP, 0, kernel->curbeSize));
+ kernel->curbeSize += this->simdWidth * sizeof(uint16_t);
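+  // The block IP entry therefore always sits in front of the kernel arguments
+  // in the curbe; allocatePayloadReg below looks its offset up and binds it to
+  // the fake register 0xffff used for branches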
+
// Go over the arguments and find the related patch locations
const uint32_t inputNum = fn.inputNum();
for (uint32_t inputID = 0u; inputID < inputNum; ++inputID) {
// Already inserted registers go here
set<ir::Register> specialRegs;
- // We insert the block IP mask first
- kernel->patches.push_back(PatchInfo(GBE_CURBE_BLOCK_IP, 0, kernel->curbeSize));
- kernel->curbeSize += this->simdWidth * sizeof(uint16_t);
-
// Then the local IDs (not scalar, so we align them properly)
kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
if (this->simdWidth == 16 || this->simdWidth == 32)
// Per-lane block IPs are always pre-allocated and used for branches. We just
// use 0xffff as a fake register for them
- static const ir::Register blockIP(0xffff);
+ static const ir::Register blockIPReg(0xffff);
void GenContext::allocatePayloadReg(gbe_curbe_type value,
uint32_t subValue,
GBE_ASSERT(blockIPOffset >= 0 && blockIPOffset % GEN_REG_SIZE == 0);
blockIPOffset /= GEN_REG_SIZE;
if (simdWidth == 8)
- RA.insert(std::make_pair(blockIP, GenReg::uw8grf(blockIPOffset, 0)));
+ RA.insert(std::make_pair(blockIPReg, GenReg::uw8grf(blockIPOffset, 0)));
else if (simdWidth == 16)
- RA.insert(std::make_pair(blockIP, GenReg::uw16grf(blockIPOffset, 0)));
+ RA.insert(std::make_pair(blockIPReg, GenReg::uw16grf(blockIPOffset, 0)));
else
NOT_SUPPORTED;
p->MOV(reg(insn.getDst(0)), reg(insn.getSrc(0)));
}
+ void GenContext::emitIntMul32x32(const ir::Instruction &insn,
+ GenReg dst, GenReg src0, GenReg src1)
+ {
+
+ const uint32_t width = p->curr.execWidth;
+ const bool src0Scalar = isScalarReg(insn.getSrc(0));
+ const bool src1Scalar = isScalarReg(insn.getSrc(1));
+
+ p->push();
+
+ // Either left part of the 16-wide register or just a simd 8 register
+ dst = GenReg::retype(dst, GEN_TYPE_D);
+ src0 = GenReg::retype(src0, GEN_TYPE_D);
+ src1 = GenReg::retype(src1, GEN_TYPE_D);
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
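+    // A 32x32 integer multiply on Gen goes through the accumulator: MUL drops
+    // the low part of each product into acc0, MACH completes the mul/mach
+    // macro, and the result is then read back from the accumulator with a MOV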
+ p->MUL(GenReg::retype(GenReg::acc(), GEN_TYPE_D), src0, src1);
+ p->MACH(GenReg::retype(GenReg::null(), GEN_TYPE_D), src0, src1);
+ p->MOV(GenReg::retype(dst, GEN_TYPE_F), GenReg::acc());
+
+ // Right part of the 16-wide register now
+ if (width == 16) {
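+      // The second half reads the accumulator back with the mask disabled,
+      // stages it in r116 (used here as a scratch GRF), and only then copies
+      // it into next(dst) under the normal execution mask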
+ p->curr.noMask = 1;
+ GenReg nextSrc0 = src0, nextSrc1 = src1;
+ if (src0Scalar == false) nextSrc0 = GenReg::next(src0);
+ if (src1Scalar == false) nextSrc1 = GenReg::next(src1);
+ p->MUL(GenReg::retype(GenReg::acc(), GEN_TYPE_D), nextSrc0, nextSrc1);
+ p->MACH(GenReg::retype(GenReg::null(), GEN_TYPE_D), nextSrc0, nextSrc1);
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(GenReg::f8grf(116,0), GenReg::acc());
+ p->curr.noMask = 0;
+ p->MOV(GenReg::retype(GenReg::next(dst), GEN_TYPE_F), GenReg::f8grf(116,0));
+ }
+ p->pop();
+ }
+
void GenContext::emitBinaryInstruction(const ir::BinaryInstruction &insn) {
using namespace ir;
const Opcode opcode = insn.getOpcode();
GenReg src0 = reg(insn.getSrc(0));
GenReg src1 = reg(insn.getSrc(1));
GBE_ASSERT(isScalarReg(insn.getDst(0)) == false);
- const bool src0Scalar = isScalarReg(insn.getSrc(0));
- const bool src1Scalar = isScalarReg(insn.getSrc(1));
// Default type is FLOAT
GBE_ASSERT(type == TYPE_U32 || type == TYPE_S32 || type == TYPE_FLOAT);
case OP_SUB: p->ADD(dst, src0, GenReg::negate(src1)); break;
case OP_MUL:
{
-#if 1
if (type == TYPE_FLOAT)
p->MUL(dst, src0, src1);
- else {
- const uint32_t width = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- p->curr.quarterControl = GEN_COMPRESSION_Q1;
- p->MUL(GenReg::retype(GenReg::acc(), GEN_TYPE_D), src0, src1);
- p->MACH(GenReg::retype(GenReg::null(), GEN_TYPE_D), src0, src1);
- p->MOV(dst, GenReg::retype(GenReg::acc(), GEN_TYPE_D));
- if (width == 16) {
- p->curr.noMask = 1;
- GenReg nextSrc0 = src0, nextSrc1 = src1;
- if (src0Scalar == false) nextSrc0 = GenReg::next(src0);
- if (src1Scalar == false) nextSrc1 = GenReg::next(src1);
- p->MUL(GenReg::retype(GenReg::acc(), GEN_TYPE_D), nextSrc0, nextSrc1);
- p->MACH(GenReg::retype(GenReg::null(), GEN_TYPE_D), nextSrc0, nextSrc1);
- p->curr.quarterControl = GEN_COMPRESSION_Q2;
- p->MOV(GenReg::d8grf(116,0), GenReg::retype(GenReg::acc(), GEN_TYPE_D));
- p->curr.noMask = 0;
- p->MOV(GenReg::next(dst), GenReg::d8grf(116,0));
- }
- p->pop();
-
- }
-#endif
+ else if (type == TYPE_U32 || type == TYPE_S32)
+ this->emitIntMul32x32(insn, dst, src0, src1);
+ else
+ NOT_IMPLEMENTED;
break;
}
default: NOT_IMPLEMENTED;
GBE_ASSERT(this->simdWidth <= 16);
const GenReg address = reg(insn.getAddress());
const GenReg value = reg(insn.getValue(0));
- // XXX remove that later. Now we just copy everything to GRFs to make it
- // contiguous
if (this->simdWidth == 8 || this->simdWidth == 16)
p->UNTYPED_READ(value, address, 0, 1);
else
GBE_ASSERT(it != RA.end());
return it->second;
}
+
/*! Emit instruction per family */
void emitUnaryInstruction(const ir::UnaryInstruction &insn);
void emitBinaryInstruction(const ir::BinaryInstruction &insn);
void emitStoreInstruction(const ir::StoreInstruction &insn);
void emitFenceInstruction(const ir::FenceInstruction &insn);
void emitLabelInstruction(const ir::LabelInstruction &insn);
+ /*! It is not natively supported on Gen. We implement it here */
+ void emitIntMul32x32(const ir::Instruction &insn, GenReg dst, GenReg src0, GenReg src1);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
/*! Simplistic allocation to start with */
* Conditional Modifier for most instructions. On Gen6+, this is also
* used for the SEND instruction's Message Target/SFID.
*/
- uint32_t destreg__conditionalmod:4;
+ uint32_t destreg_or_condmod:4;
uint32_t acc_wr_control:1;
uint32_t cmpt_control:1;
uint32_t debug_control:1;
* Author: Benjamin Segovia <benjamin.segovia@intel.com>
*/
- /*
- * Authors:
- * Keith Whitwell <keith@tungstengraphics.com>
- */
+/**
+ * \file gen_eu.cpp
+ * \author Benjamin Segovia <benjamin.segovia@intel.com>
+ * This is a revamped Gen ISA encoder from the Mesa code base
+ */
#include "backend/gen_eu.hpp"
#include <cstring>
this->curr.execWidth = simdWidth;
this->curr.quarterControl = GEN_COMPRESSION_Q1;
this->curr.noMask = 0;
+ this->curr.predicated = 1;
+ this->curr.flag = 0;
+ this->curr.inversePredicate = 0;
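+    // Assumption: with predication on by default, every instruction is emitted
+    // under flag f0, which is expected to hold the per-lane mask derived from
+    // the block IPs; push()/pop() can still turn it off locally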
}
- void GenEmitter::setExecutionWidth(GenInstruction *insn) {
+ void GenEmitter::setHeader(GenInstruction *insn) {
if (this->curr.execWidth == 8)
insn->header.execution_size = GEN_WIDTH_8;
else if (this->curr.execWidth == 16)
insn->header.execution_size = GEN_WIDTH_16;
else
- GBE_ASSERT(0);
- }
- void GenEmitter::setQuarterControl(GenInstruction *insn) {
+ NOT_IMPLEMENTED;
insn->header.quarter_control = this->curr.quarterControl;
- }
- void GenEmitter::setNoMask(GenInstruction *insn) {
insn->header.mask_control = this->curr.noMask;
- }
- void GenEmitter::setHeader(GenInstruction *insn) {
- this->setExecutionWidth(insn);
- this->setQuarterControl(insn);
- this->setNoMask(insn);
+ if (this->curr.predicated) {
+ insn->header.predicate_control = GEN_PREDICATE_NORMAL;
+ insn->header.predicate_inverse = this->curr.inversePredicate;
+ insn->bits2.da1.flag_reg_nr = this->curr.flag;
+ }
}
/* Returns the corresponding conditional mod for swapping src0 and
inst->bits3.generic_gen5.response_length = response_length;
inst->bits3.generic_gen5.msg_length = msg_length;
inst->bits3.generic_gen5.end_of_thread = end_of_thread;
- inst->header.destreg__conditionalmod = sfid;
+ inst->header.destreg_or_condmod = sfid;
}
void
assert(elemNum >= 1 && elemNum <= 4);
uint32_t msg_length = 0;
uint32_t response_length = 0;
- if (this->curr.execWidth == 8)
+ this->setHeader(insn);
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenReg::retype(GenReg::null(), GEN_TYPE_UD));
msg_length = 1+elemNum;
- else if (this->curr.execWidth == 16)
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenReg::retype(GenReg::null(), GEN_TYPE_UW));
msg_length = 2*(1+elemNum);
+ }
else
NOT_IMPLEMENTED;
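+  // The payload appears to be one GRF of addresses plus elemNum GRFs of data
+  // per SIMD8 half, which is why msg_length doubles at SIMD16 (assumption
+  // based on the untyped read/write message layout)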
- this->setHeader(insn);
- this->setDst(insn, GenReg::retype(GenReg::null(), GEN_TYPE_UW));
this->setSrc0(insn, GenReg::ud8grf(msg.nr, 0));
this->setSrc1(insn, GenReg::immud(0));
set_dp_untyped_rw(this,
{
GenInstruction *insn = this->next(GEN_OPCODE_CMP);
- insn->header.destreg__conditionalmod = conditional;
+ insn->header.destreg_or_condmod = conditional;
this->setHeader(insn);
this->setDst(insn, dest);
this->setSrc0(insn, src0);
} else
assert(src.type == GEN_TYPE_F);
- insn->header.destreg__conditionalmod = function;
+ insn->header.destreg_or_condmod = function;
insn->header.saturate = saturate;
this->setDst(insn, dest);
this->setSrc0(insn, src);
assert(src1.type == GEN_TYPE_F);
}
- insn->header.destreg__conditionalmod = function;
+ insn->header.destreg_or_condmod = function;
this->setHeader(insn);
this->setDst(insn, dest);
this->setSrc0(insn, src0);
/* Math is the same ISA format as other opcodes, except that CondModifier
* becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
*/
- insn->header.destreg__conditionalmod = function;
+ insn->header.destreg_or_condmod = function;
insn->header.saturate = saturate;
/* Source modifiers are ignored for extended math instructions. */
insn->bits3.spawner_gen5.resource = GEN_DO_NOT_DEREFERENCE_URB;
insn->bits3.spawner_gen5.msg_length = 1;
insn->bits3.spawner_gen5.end_of_thread = 1;
- insn->header.destreg__conditionalmod = GEN_SFID_THREAD_SPAWNER;
+ insn->header.destreg_or_condmod = GEN_SFID_THREAD_SPAWNER;
}
} /* namespace gbe */
* Author: Benjamin Segovia <benjamin.segovia@intel.com>
*/
- /*
- * Authors:
- * Keith Whitwell <keith@tungstengraphics.com>
- */
+/**
+ * \file gen_eu.hpp
+ * \author Benjamin Segovia <benjamin.segovia@intel.com>
+ * This is a revamped Gen ISA encoder from the Mesa code base
+ */
+
#ifndef GEN_EU_H
#define GEN_EU_H
uint32_t execWidth:6;
uint32_t quarterControl:2;
uint32_t noMask:1;
+ uint32_t predicated:1;
+ uint32_t flag:1;
+ uint32_t inversePredicate:1;
};
/*! Helper structure to emit Gen instructions */
// Helper functions to encode
////////////////////////////////////////////////////////////////////////
void setHeader(GenInstruction *insn);
- void setExecutionWidth(GenInstruction *insn);
- void setQuarterControl(GenInstruction *insn);
- void setNoMask(GenInstruction *insn);
void setDst(GenInstruction *insn, GenReg dest);
void setSrc0(GenInstruction *insn, GenReg reg);
void setSrc1(GenInstruction *insn, GenReg reg);
return shuffle<index, index, index, index>(b);
}
-/*! Base structure for scalar double word */
+/*! Base structure for scalar double word (32 bits) */
union scalar_dw {
INLINE scalar_dw(void) {}
INLINE scalar_dw(uint32_t u) { this->u = u; }
uint32_t u; int32_t s; float f;
};
+/*! Base structure for scalar word (16 bits) */
+union scalar_w {
+ INLINE scalar_w(void) {}
+ INLINE scalar_w(uint16_t u) { this->u = u; }
+ INLINE scalar_w(int16_t s) { this->s = s; }
+ INLINE float toFloat(void) const {
+ union {uint16_t u[2]; float f;} x;
+ x.u[0] = u;
+ x.u[1] = 0;
+ return x.f;
+ }
+ uint16_t u; int16_t s;
+};
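+// Note: toFloat() only builds a float bit pattern whose low 16 bits hold the
+// word (assuming a little-endian host); broadcasting it with _mm_load1_ps is
+// what lets a scalar word splat into the padded simd_w layout below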
+
/*! Base structure for scalar mask */
union scalar_m { uint32_t u; int32_t s; float f; };
__m128 m[vectorNum];
};
+/*! Base structure for 4 / 8 / 16 / 32 words. We do not store 8 shorts
+ * but only 4. This makes everything much simpler even if it is clearly slower
+ */
+template <uint32_t vectorNum>
+struct simd_w {
+ INLINE simd_w(void) {}
+ INLINE simd_w(const scalar_w &s) {
+ const float f = s.toFloat();
+ for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f);
+ }
+ simd_w &operator= (const scalar_w &s) {
+ const float f = s.toFloat();
+ for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f);
+ return *this;
+ }
+ __m128 m[vectorNum];
+};
+
/*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for "mask") */
template <uint32_t vectorNum>
struct simd_m {
dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);
}
-/*! To cast through memory */
+/*! To cast 32-bit values held in SSE registers through memory */
union cast_dw {
INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;
int32_t s[4];
float f[4];
};
-static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+static const cast_dw allTrue32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+
+/*! To cast 16-bit values held in SSE registers through memory */
+union cast_w {
+ INLINE cast_w(int16_t s0, int16_t s1, int16_t s2, int16_t s3) {
+ s[0].v = s0; s[1].v = s1; s[2].v = s2; s[3].v = s3;
+ s[0].pad = s[1].pad = s[2].pad = s[3].pad = 0;
+ }
+ INLINE cast_w(uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3) {
+ u[0].v = u0; u[1].v = u1; u[2].v = u2; u[3].v = u3;
+ u[0].pad = u[1].pad = u[2].pad = u[3].pad = 0;
+ }
+ INLINE cast_w(const __m128 &v) : v(v) {}
+ INLINE cast_w(const __m128i &vi) : vi(vi) {}
+ INLINE cast_w(void) {}
+ __m128 v;
+ __m128i vi;
+ struct { uint16_t v; uint16_t pad; } u[4];
+ struct { int16_t v; int16_t pad; } s[4];
+};
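+// Each 16-bit value therefore sits in the low half of a 32-bit lane with an
+// explicit zero pad (little-endian again), mirroring how simd_w stores words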
/*! Make a mask true */
template <uint32_t vectorNum>
-INLINE void alltrueMask(simd_m<vectorNum> &x) {
- for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;
+INLINE void allTrue32Mask(simd_m<vectorNum> &x) {
+ for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = allTrue32.v;
}
/* Some convenient typedefs */
typedef simd_dw<2> simd8dw;
typedef simd_dw<4> simd16dw;
typedef simd_dw<8> simd32dw;
+typedef scalar_w simd1w;
+typedef simd_w<1> simd4w;
+typedef simd_w<2> simd8w;
+typedef simd_w<4> simd16w;
+typedef simd_w<8> simd32w;
typedef scalar_m simd1m;
typedef simd_m<1> simd4m;
typedef simd_m<2> simd8m;
/* Simple function to get the number of element per vector */
template <uint32_t vectorNum>
INLINE uint32_t elemNum(const simd_dw<vectorNum> &x) {
- return 4 * vectorNum;
+ return 4*vectorNum;
}
template <uint32_t vectorNum>
INLINE uint32_t elemNum(const simd_m<vectorNum> &x) {
- return 4 * vectorNum;
+ return 4*vectorNum;
+}
+template <uint32_t vectorNum>
+INLINE uint32_t elemNum(const simd_w<vectorNum> &x) {
+ return 4*vectorNum;
}
/* Build an integer mask from the mask vectors */
const __m128 v = _mm_load1_ps(&x.f);
for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;
}
+template <uint32_t vectorNum>
+INLINE void MOV_S16(simd_w<vectorNum> &dst, const simd_w<vectorNum> &v) {
+ for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];
+}
+template <uint32_t vectorNum>
+INLINE void MOV_S16(simd_w<vectorNum> &dst, const scalar_w &x) {
+ const float f = x.toFloat();
+ const __m128 v = _mm_load1_ps(&f);
+ for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;
+}
/* Vector instructions that use sse* */
-#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
+#define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\
template <uint32_t vectorNum>\
INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\
for (uint32_t i = 0; i < vectorNum; ++i)\
dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\
- NAME(dst, v0, simd_dw<vectorNum>(v1));\
+INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SCALAR_TYPE &v1) {\
+ NAME(dst, v0, SRC_TYPE(v1));\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\
- NAME(dst, simd_dw<vectorNum>(v0), v1);\
+INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SRC_TYPE &v1) {\
+ NAME(dst, SRC_TYPE(v0), v1);\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const scalar_dw &v1) {\
- NAME(dst, simd_dw<vectorNum>(v0), simd_dw<vectorNum>(v1));\
-}
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, ADD_F, _mm_add_ps, ID, ID, ID);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, SUB_F, _mm_sub_ps, ID, ID, ID);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, MUL_F, _mm_mul_ps, ID, ID, ID);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, DIV_F, _mm_div_ps, ID, ID, ID);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, EQ_F, _mm_cmpeq_ps, ID, ID, ID);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, NE_F, _mm_cmpneq_ps, ID, ID, ID);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LT_F, _mm_cmplt_ps, ID, ID, ID);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LE_F, _mm_cmple_ps, ID, ID, ID);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GT_F, _mm_cmpgt_ps, ID, ID, ID);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GE_F, _mm_cmpge_ps, ID, ID, ID);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, OR_S32, _mm_or_ps, ID, ID, ID);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, XOR_S32, _mm_xor_ps, ID, ID, ID);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, AND_S32, _mm_and_ps, ID, ID, ID);
+INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\
+ NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\
+}
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, ADD_F, _mm_add_ps, ID, ID, ID);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, SUB_F, _mm_sub_ps, ID, ID, ID);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, MUL_F, _mm_mul_ps, ID, ID, ID);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, DIV_F, _mm_div_ps, ID, ID, ID);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, EQ_F, _mm_cmpeq_ps, ID, ID, ID);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, NE_F, _mm_cmpneq_ps, ID, ID, ID);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, LT_F, _mm_cmplt_ps, ID, ID, ID);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, LE_F, _mm_cmple_ps, ID, ID, ID);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, GT_F, _mm_cmpgt_ps, ID, ID, ID);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, GE_F, _mm_cmpge_ps, ID, ID, ID);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, AND_S32, _mm_and_ps, ID, ID, ID);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, OR_S32, _mm_or_ps, ID, ID, ID);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, XOR_S32, _mm_xor_ps, ID, ID, ID);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, EQ_S16, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, ADD_S16, _mm_add_epi16, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, SUB_S16, _mm_sub_epi16, SI2PS, PS2SI, PS2SI);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, AND_S16, _mm_and_ps, ID, ID, ID);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, OR_S16, _mm_or_ps, ID, ID, ID);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, XOR_S16, _mm_xor_ps, ID, ID, ID);
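+// For instance ADD_S16 adds the four padded words per __m128 with
+// _mm_add_epi16 (the zeroed upper halves stay zero), and EQ_S16 may compare
+// whole 32-bit lanes since equal words also carry equal pads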
#undef VEC_OP
/* Vector integer operations that we can get by switching argument order */
template <uint32_t vectorNum>\
INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\
for (uint32_t i = 0; i < vectorNum; ++i)\
- dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), alltrue.v);\
+ dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), allTrue32.v);\
}\
template <uint32_t vectorNum>\
INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\
#undef VEC_OP
/* Vector binary integer operations that require C */
-#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\
+#define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, CAST_TYPE, NAME, OP, FIELD)\
template <uint32_t vectorNum>\
INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\
for (uint32_t i = 0; i < vectorNum; ++i) {\
- cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\
+ CAST_TYPE c0(v0.m[i]), c1(v1.m[i]), d;\
for (uint32_t j = 0; j < 4; ++j)\
- d.FIELD[j] = c0.FIELD[j] OP c1.FIELD[j];\
+ d.FIELD = c0.FIELD OP c1.FIELD;\
dst.m[i] = d.v;\
}\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\
- NAME(dst, v0, simd_dw<vectorNum>(v1));\
+INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SCALAR_TYPE &v1) {\
+ NAME(dst, v0, SRC_TYPE(v1));\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\
- NAME(dst, simd_dw<vectorNum>(v0), v1);\
+INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SRC_TYPE &v1) {\
+ NAME(dst, SRC_TYPE(v0), v1);\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const scalar_dw &v1) {\
- NAME(dst, simd_dw<vectorNum>(v0), simd_dw<vectorNum>(v1));\
-}
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, MUL_S32, *, s);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, DIV_S32, /, s);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, REM_S32, %, s);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, MUL_U32, *, u);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, DIV_U32, /, u);
-VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, REM_U32, %, u);
+INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\
+ NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\
+}
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, MUL_S32, *, s[j]);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, DIV_S32, /, s[j]);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, REM_S32, %, s[j]);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, MUL_U32, *, u[j]);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, DIV_U32, /, u[j]);
+VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, REM_U32, %, u[j]);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, MUL_S16, *, s[j].v);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, DIV_S16, /, s[j].v);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, REM_S16, %, s[j].v);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, MUL_U16, *, u[j].v);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, DIV_U16, /, u[j].v);
+VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, REM_U16, %, u[j].v);
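+// Multiply, divide and remainder fall back to plain C through the cast
+// unions, presumably because SSE2 has no packed integer division and the
+// padded word layout makes the scalar loop the simplest route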
#undef VEC_OP
/* Vector compare vectors that require C */
-#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\
+#define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, CAST_TYPE, NAME, OP, FIELD)\
template <uint32_t vectorNum>\
INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\
for (uint32_t i = 0; i < vectorNum; ++i) {\
- cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\
+ CAST_TYPE c0(v0.m[i]), c1(v1.m[i]);\
+ cast_dw d;\
for (uint32_t j = 0; j < 4; ++j)\
- d.u[j] = (c0.FIELD[j] OP c1.FIELD[j]) ? ~0u : 0u;\
+ d.u[j] = (c0.FIELD OP c1.FIELD) ? ~0u : 0u;\
dst.m[i] = d.v;\
}\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\
- for (uint32_t i = 0; i < vectorNum; ++i) {\
- cast_dw c0(v0.m[i]), d;\
- for (uint32_t j = 0; j < 4; ++j)\
- d.u[j] = (c0.FIELD[j] OP v1.FIELD) ? ~0u : 0u;\
- dst.m[i] = d.v;\
- }\
+INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SCALAR_TYPE &v1) {\
+ NAME(dst, v0, SRC_TYPE(v1));\
}\
template <uint32_t vectorNum>\
-INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\
- for (uint32_t i = 0; i < vectorNum; ++i) {\
- cast_dw c1(v1.m[i]), d;\
- for (uint32_t j = 0; j < 4; ++j)\
- d.u[j] = (v0.FIELD OP c1.FIELD[j]) ? ~0u : 0u;\
- dst.m[i] = d.v;\
- }\
-}
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LE_U32, <=, u);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LT_U32, <, u);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GE_U32, >=, u);
-VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GT_U32, >, u);
+INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SRC_TYPE &v1) {\
+ NAME(dst, SRC_TYPE(v0), v1);\
+}\
+template <uint32_t vectorNum>\
+INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\
+ NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\
+}
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, LE_U32, <=, u[j]);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, LT_U32, <, u[j]);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, GE_U32, >=, u[j]);
+VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, GT_U32, >, u[j]);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LE_U16, <=, u[j].v);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LT_U16, <, u[j].v);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GE_U16, >=, u[j].v);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GT_U16, >, u[j].v);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LE_S16, <=, s[j].v);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LT_S16, <, s[j].v);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GE_S16, >=, s[j].v);
+VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GT_S16, >, s[j].v);
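+// All compares build their result through cast_dw, so the 16-bit variants
+// still produce a full 32-bit ~0u/0u mask per lane as simd_m expects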
#undef VEC_OP
+/* Get NE from EQ */
template <uint32_t vectorNum>
INLINE void NE_S32(simd_m<vectorNum> &dst,
const simd_dw<vectorNum> &v0,
const simd_dw<vectorNum> &v1)
{
for (uint32_t i = 0; i < vectorNum; ++i)
- dst.m[i] = _mm_xor_ps(alltrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));
+ dst.m[i] = _mm_xor_ps(allTrue32.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));
}
template <uint32_t vectorNum>
INLINE void NE_S32(simd_m<vectorNum> &dst,
{
NE_S32(dst, simd_dw<vectorNum>(v0), v1);
}
+template <uint32_t vectorNum>
+INLINE void NE_S16(simd_m<vectorNum> &dst,
+ const simd_w<vectorNum> &v0,
+ const simd_w<vectorNum> &v1)
+{
+ for (uint32_t i = 0; i < vectorNum; ++i)
+ dst.m[i] = _mm_xor_ps(allTrue32.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));
+}
+template <uint32_t vectorNum>
+INLINE void NE_S16(simd_m<vectorNum> &dst,
+ const simd_w<vectorNum> &v0,
+ const scalar_w &v1)
+{
+ NE_S16(dst, v0, simd_w<vectorNum>(v1));
+}
+template <uint32_t vectorNum>
+INLINE void NE_S16(simd_m<vectorNum> &dst,
+ const scalar_w &v0,
+ const simd_w<vectorNum> &v1)
+{
+ NE_S16(dst, simd_w<vectorNum>(v0), v1);
+}
+template <uint32_t vectorNum>
+INLINE void NE_S16(simd_m<vectorNum> &dst,
+ const scalar_w &v0,
+ const scalar_w &v1)
+{
+ NE_S16(dst, simd_w<vectorNum>(v0), simd_w<vectorNum>(v1));
+}
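+// NE_S16 reuses the 32-bit lane compare and inverts it with allTrue32, which
+// is valid as long as the pads of both operands are kept at zero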
/* Load from contiguous double words */
template <uint32_t vectorNum>
_mm_storeu_ps((float*) ptr + 4*i, src.m[i]);
}
+/* Load from contiguous words */
+template <uint32_t vectorNum>
+INLINE void LOAD(simd_w<vectorNum> &dst, const char *ptr) {
+ for (uint32_t i = 0; i < vectorNum; ++i) {
+ const uint16_t u0 = *((uint16_t*) ptr + 4*i + 0);
+ const uint16_t u1 = *((uint16_t*) ptr + 4*i + 1);
+ const uint16_t u2 = *((uint16_t*) ptr + 4*i + 2);
+ const uint16_t u3 = *((uint16_t*) ptr + 4*i + 3);
+ const cast_w w(u0,u1,u2,u3);
+ dst.m[i] = w.v;
+ }
+}
+
+/* Store to contiguous words */
+template <uint32_t vectorNum>
+INLINE void STORE(const simd_w<vectorNum> &src, char *ptr) {
+ for (uint32_t i = 0; i < vectorNum; ++i) {
+ const cast_w w(src.m[i]);
+ *((uint16_t*) ptr + 4*i + 0) = w.u[0].v;
+ *((uint16_t*) ptr + 4*i + 1) = w.u[1].v;
+ *((uint16_t*) ptr + 4*i + 2) = w.u[2].v;
+ *((uint16_t*) ptr + 4*i + 3) = w.u[3].v;
+ }
+}
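+// In memory the words stay densely packed (four consecutive uint16_t per
+// __m128), while in registers they occupy one padded 32-bit lane each;
+// LOAD/STORE do the widening and narrowing through cast_w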
+
/* Load immediates */
template <uint32_t vectorNum>
INLINE void LOADI(simd_dw<vectorNum> &dst, uint32_t u) {
// Scalar instructions
//////////////////////////////////////////////////////////////////////////////
INLINE uint32_t elemNum(const scalar_dw &x) { return 1; }
+INLINE uint32_t elemNum(const scalar_w &x) { return 1; }
INLINE uint32_t elemNum(const scalar_m &x) { return 1; }
INLINE uint32_t mask(const scalar_m &v) { return v.u ? 1 : 0; }
+
+// 32 bit floating points
INLINE void ADD_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f + v1.f; }
INLINE void SUB_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f - v1.f; }
INLINE void MUL_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f * v1.f; }
INLINE void LT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f < v1.f ? ~0 : 0); }
INLINE void GE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f >= v1.f ? ~0 : 0); }
INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? ~0 : 0); }
+
+// 32 bit integers
INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; }
INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; }
INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; }
INLINE void SCATTER(scalar_dw offset, scalar_dw value, char *base) { *(uint32_t*)(base + offset.u) = value.u; }
INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = *(const uint32_t*)(base + offset.u); }
+// 16 bit integers
+INLINE void ADD_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u + v1.u; }
+INLINE void SUB_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u - v1.u; }
+INLINE void ADD_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s + v1.s; }
+INLINE void SUB_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s - v1.s; }
+INLINE void MUL_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s * v1.s; }
+INLINE void DIV_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s / v1.s; }
+INLINE void REM_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s % v1.s; }
+INLINE void MUL_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u * v1.u; }
+INLINE void DIV_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u / v1.u; }
+INLINE void REM_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u % v1.u; }
+INLINE void EQ_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s == v1.s ? ~0 : 0); }
+INLINE void NE_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s != v1.s ? ~0 : 0); }
+INLINE void LE_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); }
+INLINE void LT_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s < v1.s ? ~0 : 0); }
+INLINE void GE_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s >= v1.s ? ~0 : 0); }
+INLINE void GT_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s > v1.s ? ~0 : 0); }
+INLINE void XOR_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s ^ v1.s; }
+INLINE void OR_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s | v1.s; }
+INLINE void AND_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s & v1.s; }
+INLINE void LE_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); }
+INLINE void LT_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u < v1.u ? ~0 : 0); }
+INLINE void GE_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); }
+INLINE void GT_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u > v1.u ? ~0 : 0); }
+INLINE void LOAD(scalar_w &dst, const char *ptr) { dst.u = *(const uint16_t *) ptr; }
+INLINE void STORE(scalar_w src, char *ptr) { *(uint16_t *) ptr = src.u; }
+INLINE void LOADI(scalar_w &dst, uint16_t u) { dst.u = u; }
+INLINE void SCATTER(scalar_w offset, scalar_w value, char *base) { *(uint16_t*)(base + offset.u) = value.u; }
+INLINE void GATHER(scalar_w &dst, scalar_w offset, const char *base) { dst.u = *(const uint16_t*)(base + offset.u); }
+
//////////////////////////////////////////////////////////////////////////////
// Identical instructions are forwarded
//////////////////////////////////////////////////////////////////////////////
+// Forward identical 32 bit instructions
#define MOV_U32 MOV_S32
#define MOV_F MOV_S32
#define ADD_U32 ADD_S32
#define EQ_U32 EQ_S32
#define NE_U32 NE_S32
+// Forward identical 16 bit instructions
+#define MOV_U16 MOV_S16
+#define ADD_U16 ADD_S16
+#define SUB_U16 SUB_S16
+#define AND_U16 AND_S16
+#define XOR_U16 XOR_S16
+#define OR_U16 OR_S16
+#define EQ_U16 EQ_S16
+#define NE_U16 NE_S16
+
#undef PS2SI
#undef SI2PS
#undef ID
" return shuffle<index, index, index, index>(b);\n"
"}\n"
"\n"
-"/*! Base structure for scalar double word */\n"
+"/*! Base structure for scalar double word (32 bits) */\n"
"union scalar_dw {\n"
" INLINE scalar_dw(void) {}\n"
" INLINE scalar_dw(uint32_t u) { this->u = u; }\n"
" uint32_t u; int32_t s; float f;\n"
"};\n"
"\n"
+"/*! Base structure for scalar word (16 bits) */\n"
+"union scalar_w {\n"
+" INLINE scalar_w(void) {}\n"
+" INLINE scalar_w(uint16_t u) { this->u = u; }\n"
+" INLINE scalar_w(int16_t s) { this->s = s; }\n"
+" INLINE float toFloat(void) const {\n"
+" union {uint16_t u[2]; float f;} x;\n"
+" x.u[0] = u;\n"
+" x.u[1] = 0;\n"
+" return x.f;\n"
+" }\n"
+" uint16_t u; int16_t s;\n"
+"};\n"
+"\n"
"/*! Base structure for scalar mask */\n"
"union scalar_m { uint32_t u; int32_t s; float f; };\n"
"\n"
" __m128 m[vectorNum];\n"
"};\n"
"\n"
+"/*! Base structure for vectors 4 / 8 / 16 / 32 words. We do not store 8 shorts\n"
+" * but only 4. This makes everything much simpler even if it is clearly slower\n"
+" */\n"
+"template <uint32_t vectorNum>\n"
+"struct simd_w {\n"
+" INLINE simd_w(void) {}\n"
+" INLINE simd_w(const scalar_w &s) {\n"
+" const float f = s.toFloat();\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f);\n"
+" }\n"
+" simd_w &operator= (const scalar_w &s) {\n"
+" const float f = s.toFloat();\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&f);\n"
+" return *this;\n"
+" }\n"
+" __m128 m[vectorNum];\n"
+"};\n"
+"\n"
"/*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for \"mask\") */\n"
"template <uint32_t vectorNum>\n"
"struct simd_m {\n"
" dst.m[i] = _mm_blendv_ps(src0.m[i], src1.m[i], mask.m[i]);\n"
"}\n"
"\n"
-"/*! To cast through memory */\n"
+"/*! To cast through memory 32 bits values in sse registers */\n"
"union cast_dw {\n"
" INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n"
" u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n"
" int32_t s[4];\n"
" float f[4];\n"
"};\n"
-"static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n"
+"static const cast_dw allTrue32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n"
+"\n"
+"/*! To cast through memory 16 bits values in sse registers */\n"
+"union cast_w {\n"
+" INLINE cast_w(int16_t s0, int16_t s1, int16_t s2, int16_t s3) {\n"
+" s[0].v = s0; s[1].v = s1; s[2].v = s2; s[3].v = s3;\n"
+" s[0].pad = s[1].pad = s[2].pad = s[3].pad = 0;\n"
+" }\n"
+" INLINE cast_w(uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3) {\n"
+" u[0].v = u0; u[1].v = u1; u[2].v = u2; u[3].v = u3;\n"
+" u[0].pad = u[1].pad = u[2].pad = u[3].pad = 0;\n"
+" }\n"
+" INLINE cast_w(const __m128 &v) : v(v) {}\n"
+" INLINE cast_w(const __m128i &vi) : vi(vi) {}\n"
+" INLINE cast_w(void) {}\n"
+" __m128 v;\n"
+" __m128i vi;\n"
+" struct { uint16_t v; uint16_t pad; } u[4];\n"
+" struct { int16_t v; int16_t pad; } s[4];\n"
+"};\n"
"\n"
"/*! Make a mask true */\n"
"template <uint32_t vectorNum>\n"
-"INLINE void alltrueMask(simd_m<vectorNum> &x) {\n"
-" for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = alltrue.v;\n"
+"INLINE void allTrue32Mask(simd_m<vectorNum> &x) {\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) x.m[i] = allTrue32.v;\n"
"}\n"
"\n"
"/* Some convenient typedefs */\n"
"typedef simd_dw<2> simd8dw;\n"
"typedef simd_dw<4> simd16dw;\n"
"typedef simd_dw<8> simd32dw;\n"
+"typedef scalar_w simd1w;\n"
+"typedef simd_w<1> simd4w;\n"
+"typedef simd_w<2> simd8w;\n"
+"typedef simd_w<4> simd16w;\n"
+"typedef simd_w<8> simd32w;\n"
"typedef scalar_m simd1m;\n"
"typedef simd_m<1> simd4m;\n"
"typedef simd_m<2> simd8m;\n"
"/* Simple function to get the number of element per vector */\n"
"template <uint32_t vectorNum>\n"
"INLINE uint32_t elemNum(const simd_dw<vectorNum> &x) {\n"
-" return 4 * vectorNum;\n"
+" return 4*vectorNum;\n"
"}\n"
"template <uint32_t vectorNum>\n"
"INLINE uint32_t elemNum(const simd_m<vectorNum> &x) {\n"
-" return 4 * vectorNum;\n"
+" return 4*vectorNum;\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE uint32_t elemNum(const simd_w<vectorNum> &x) {\n"
+" return 4*vectorNum;\n"
"}\n"
"\n"
"/* Build an integer mask from the mask vectors */\n"
" const __m128 v = _mm_load1_ps(&x.f);\n"
" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n"
"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S16(simd_w<vectorNum> &dst, const simd_w<vectorNum> &v) {\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v.m[i];\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void MOV_S16(simd_w<vectorNum> &dst, const scalar_w &x) {\n"
+" const float f = x.toFloat();\n"
+" const __m128 v = _mm_load1_ps(&f);\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = v;\n"
+"}\n"
"\n"
"/* Vector instructions that use sse* */\n"
-"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n"
+"#define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\\n"
"template <uint32_t vectorNum>\\\n"
"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\\n"
" for (uint32_t i = 0; i < vectorNum; ++i)\\\n"
" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\\n"
-" NAME(dst, v0, simd_dw<vectorNum>(v1));\\\n"
+"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SCALAR_TYPE &v1) {\\\n"
+" NAME(dst, v0, SRC_TYPE(v1));\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\\n"
-" NAME(dst, simd_dw<vectorNum>(v0), v1);\\\n"
+"INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SRC_TYPE &v1) {\\\n"
+" NAME(dst, SRC_TYPE(v0), v1);\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const scalar_dw &v1) {\\\n"
-" NAME(dst, simd_dw<vectorNum>(v0), simd_dw<vectorNum>(v1));\\\n"
-"}\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, ADD_F, _mm_add_ps, ID, ID, ID);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, SUB_F, _mm_sub_ps, ID, ID, ID);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, MUL_F, _mm_mul_ps, ID, ID, ID);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, DIV_F, _mm_div_ps, ID, ID, ID);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, EQ_F, _mm_cmpeq_ps, ID, ID, ID);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, NE_F, _mm_cmpneq_ps, ID, ID, ID);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LT_F, _mm_cmplt_ps, ID, ID, ID);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LE_F, _mm_cmple_ps, ID, ID, ID);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GT_F, _mm_cmpgt_ps, ID, ID, ID);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GE_F, _mm_cmpge_ps, ID, ID, ID);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, OR_S32, _mm_or_ps, ID, ID, ID);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, XOR_S32, _mm_xor_ps, ID, ID, ID);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, AND_S32, _mm_and_ps, ID, ID, ID);\n"
+"INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\\\n"
+" NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\\\n"
+"}\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, ADD_F, _mm_add_ps, ID, ID, ID);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, SUB_F, _mm_sub_ps, ID, ID, ID);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, MUL_F, _mm_mul_ps, ID, ID, ID);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, DIV_F, _mm_div_ps, ID, ID, ID);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, EQ_F, _mm_cmpeq_ps, ID, ID, ID);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, NE_F, _mm_cmpneq_ps, ID, ID, ID);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, LT_F, _mm_cmplt_ps, ID, ID, ID);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, LE_F, _mm_cmple_ps, ID, ID, ID);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, GT_F, _mm_cmpgt_ps, ID, ID, ID);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, GE_F, _mm_cmpge_ps, ID, ID, ID);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, AND_S32, _mm_and_ps, ID, ID, ID);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, OR_S32, _mm_or_ps, ID, ID, ID);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, XOR_S32, _mm_xor_ps, ID, ID, ID);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, EQ_S16, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, ADD_S16, _mm_add_epi16, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, SUB_S16, _mm_sub_epi16, SI2PS, PS2SI, PS2SI);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, AND_S16, _mm_and_ps, ID, ID, ID);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, OR_S16, _mm_or_ps, ID, ID, ID);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, XOR_S16, _mm_xor_ps, ID, ID, ID);\n"
"#undef VEC_OP\n"
"\n"
"/* Vector integer operations that we can get by switching argument order */\n"
"template <uint32_t vectorNum>\\\n"
"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\\n"
" for (uint32_t i = 0; i < vectorNum; ++i)\\\n"
-" dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), alltrue.v);\\\n"
+" dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), allTrue32.v);\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\\n"
"#undef VEC_OP\n"
"\n"
"/* Vector binary integer operations that require C */\n"
-"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\\\n"
+"#define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, CAST_TYPE, NAME, OP, FIELD)\\\n"
"template <uint32_t vectorNum>\\\n"
"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\\n"
" for (uint32_t i = 0; i < vectorNum; ++i) {\\\n"
-" cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\\\n"
+" CAST_TYPE c0(v0.m[i]), c1(v1.m[i]), d;\\\n"
" for (uint32_t j = 0; j < 4; ++j)\\\n"
-" d.FIELD[j] = c0.FIELD[j] OP c1.FIELD[j];\\\n"
+" d.FIELD = c0.FIELD OP c1.FIELD;\\\n"
" dst.m[i] = d.v;\\\n"
" }\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\\n"
-" NAME(dst, v0, simd_dw<vectorNum>(v1));\\\n"
+"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SCALAR_TYPE &v1) {\\\n"
+" NAME(dst, v0, SRC_TYPE(v1));\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\\n"
-" NAME(dst, simd_dw<vectorNum>(v0), v1);\\\n"
+"INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SRC_TYPE &v1) {\\\n"
+" NAME(dst, SRC_TYPE(v0), v1);\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const scalar_dw &v1) {\\\n"
-" NAME(dst, simd_dw<vectorNum>(v0), simd_dw<vectorNum>(v1));\\\n"
-"}\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, MUL_S32, *, s);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, DIV_S32, /, s);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, REM_S32, %, s);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, MUL_U32, *, u);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, DIV_U32, /, u);\n"
-"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, REM_U32, %, u);\n"
+"INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\\\n"
+" NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\\\n"
+"}\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, MUL_S32, *, s[j]);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, DIV_S32, /, s[j]);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, REM_S32, %, s[j]);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, MUL_U32, *, u[j]);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, DIV_U32, /, u[j]);\n"
+"VEC_OP(simd_dw<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, REM_U32, %, u[j]);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, MUL_S16, *, s[j].v);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, DIV_S16, /, s[j].v);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, REM_S16, %, s[j].v);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, MUL_U16, *, u[j].v);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, DIV_U16, /, u[j].v);\n"
+"VEC_OP(simd_w<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, REM_U16, %, u[j].v);\n"
"#undef VEC_OP\n"
"\n"
"/* Vector compare vectors that require C */\n"
-"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\\\n"
+"#define VEC_OP(DST_TYPE, SRC_TYPE, SCALAR_TYPE, CAST_TYPE, NAME, OP, FIELD)\\\n"
"template <uint32_t vectorNum>\\\n"
"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\\n"
" for (uint32_t i = 0; i < vectorNum; ++i) {\\\n"
-" cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\\\n"
+" CAST_TYPE c0(v0.m[i]), c1(v1.m[i]);\\\n"
+" cast_dw d;\\\n"
" for (uint32_t j = 0; j < 4; ++j)\\\n"
-" d.u[j] = (c0.FIELD[j] OP c1.FIELD[j]) ? ~0u : 0u;\\\n"
+" d.u[j] = (c0.FIELD OP c1.FIELD) ? ~0u : 0u;\\\n"
" dst.m[i] = d.v;\\\n"
" }\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\\n"
-" for (uint32_t i = 0; i < vectorNum; ++i) {\\\n"
-" cast_dw c0(v0.m[i]), d;\\\n"
-" for (uint32_t j = 0; j < 4; ++j)\\\n"
-" d.u[j] = (c0.FIELD[j] OP v1.FIELD) ? ~0u : 0u;\\\n"
-" dst.m[i] = d.v;\\\n"
-" }\\\n"
+"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SCALAR_TYPE &v1) {\\\n"
+" NAME(dst, v0, SRC_TYPE(v1));\\\n"
"}\\\n"
"template <uint32_t vectorNum>\\\n"
-"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\\n"
-" for (uint32_t i = 0; i < vectorNum; ++i) {\\\n"
-" cast_dw c1(v1.m[i]), d;\\\n"
-" for (uint32_t j = 0; j < 4; ++j)\\\n"
-" d.u[j] = (v0.FIELD OP c1.FIELD[j]) ? ~0u : 0u;\\\n"
-" dst.m[i] = d.v;\\\n"
-" }\\\n"
-"}\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LE_U32, <=, u);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, LT_U32, <, u);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GE_U32, >=, u);\n"
-"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, GT_U32, >, u);\n"
+"INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SRC_TYPE &v1) {\\\n"
+" NAME(dst, SRC_TYPE(v0), v1);\\\n"
+"}\\\n"
+"template <uint32_t vectorNum>\\\n"
+"INLINE void NAME(DST_TYPE &dst, const SCALAR_TYPE &v0, const SCALAR_TYPE &v1) {\\\n"
+" NAME(dst, SRC_TYPE(v0), SRC_TYPE(v1));\\\n"
+"}\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, LE_U32, <=, u[j]);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, LT_U32, <, u[j]);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, GE_U32, >=, u[j]);\n"
+"VEC_OP(simd_m<vectorNum>, simd_dw<vectorNum>, scalar_dw, cast_dw, GT_U32, >, u[j]);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LE_U16, <=, u[j].v);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LT_U16, <, u[j].v);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GE_U16, >=, u[j].v);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GT_U16, >, u[j].v);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LE_S16, <=, s[j].v);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, LT_S16, <, s[j].v);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GE_S16, >=, s[j].v);\n"
+"VEC_OP(simd_m<vectorNum>, simd_w<vectorNum>, scalar_w, cast_w, GT_S16, >, s[j].v);\n"
"#undef VEC_OP\n"
"\n"
+"/* Get NE from EQ */\n"
"template <uint32_t vectorNum>\n"
"INLINE void NE_S32(simd_m<vectorNum> &dst,\n"
" const simd_dw<vectorNum> &v0,\n"
" const simd_dw<vectorNum> &v1)\n"
"{\n"
" for (uint32_t i = 0; i < vectorNum; ++i)\n"
-" dst.m[i] = _mm_xor_ps(alltrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));\n"
+" dst.m[i] = _mm_xor_ps(allTrue32.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));\n"
"}\n"
"template <uint32_t vectorNum>\n"
"INLINE void NE_S32(simd_m<vectorNum> &dst,\n"
"{\n"
" NE_S32(dst, simd_dw<vectorNum>(v0), v1);\n"
"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void NE_S16(simd_m<vectorNum> &dst,\n"
+" const simd_w<vectorNum> &v0,\n"
+" const simd_w<vectorNum> &v1)\n"
+"{\n"
+" for (uint32_t i = 0; i < vectorNum; ++i)\n"
+" dst.m[i] = _mm_xor_ps(allTrue32.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void NE_S16(simd_m<vectorNum> &dst,\n"
+" const simd_w<vectorNum> &v0,\n"
+" const scalar_w &v1)\n"
+"{\n"
+" NE_S16(dst, v0, simd_w<vectorNum>(v1));\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void NE_S16(simd_m<vectorNum> &dst,\n"
+" const scalar_w &v0,\n"
+" const simd_w<vectorNum> &v1)\n"
+"{\n"
+" NE_S16(dst, simd_w<vectorNum>(v0), v1);\n"
+"}\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void NE_S16(simd_m<vectorNum> &dst,\n"
+" const scalar_w &v0,\n"
+" const scalar_w &v1)\n"
+"{\n"
+" NE_S16(dst, simd_w<vectorNum>(v0), simd_w<vectorNum>(v1));\n"
+"}\n"
"\n"
"/* Load from contiguous double words */\n"
"template <uint32_t vectorNum>\n"
" _mm_storeu_ps((float*) ptr + 4*i, src.m[i]);\n"
"}\n"
"\n"
+"/* Load from contiguous words */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void LOAD(simd_w<vectorNum> &dst, const char *ptr) {\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+" const uint16_t u0 = *((uint16_t*) ptr + 4*i + 0);\n"
+" const uint16_t u1 = *((uint16_t*) ptr + 4*i + 1);\n"
+" const uint16_t u2 = *((uint16_t*) ptr + 4*i + 2);\n"
+" const uint16_t u3 = *((uint16_t*) ptr + 4*i + 3);\n"
+" const cast_w w(u0,u1,u2,u3);\n"
+" dst.m[i] = w.v;\n"
+" }\n"
+"}\n"
+"\n"
+"/* Store to contiguous words */\n"
+"template <uint32_t vectorNum>\n"
+"INLINE void STORE(const simd_w<vectorNum> &src, char *ptr) {\n"
+" for (uint32_t i = 0; i < vectorNum; ++i) {\n"
+" const cast_w w(src.m[i]);\n"
+" *((uint16_t*) ptr + 4*i + 0) = w.u[0].v;\n"
+" *((uint16_t*) ptr + 4*i + 1) = w.u[1].v;\n"
+" *((uint16_t*) ptr + 4*i + 2) = w.u[2].v;\n"
+" *((uint16_t*) ptr + 4*i + 3) = w.u[3].v;\n"
+" }\n"
+"}\n"
+"\n"
"/* Load immediates */\n"
"template <uint32_t vectorNum>\n"
"INLINE void LOADI(simd_dw<vectorNum> &dst, uint32_t u) {\n"
"// Scalar instructions\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"INLINE uint32_t elemNum(const scalar_dw &x) { return 1; }\n"
+"INLINE uint32_t elemNum(const scalar_w &x) { return 1; }\n"
"INLINE uint32_t elemNum(const scalar_m &x) { return 1; }\n"
"INLINE uint32_t mask(const scalar_m &v) { return v.u ? 1 : 0; }\n"
+"\n"
+"// 32 bit floating points\n"
"INLINE void ADD_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f + v1.f; }\n"
"INLINE void SUB_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f - v1.f; }\n"
"INLINE void MUL_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f * v1.f; }\n"
"INLINE void LT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f < v1.f ? ~0 : 0); }\n"
"INLINE void GE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f >= v1.f ? ~0 : 0); }\n"
"INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? ~0 : 0); }\n"
+"\n"
+"// 32 bit integers\n"
"INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; }\n"
"INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; }\n"
"INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; }\n"
"INLINE void SCATTER(scalar_dw offset, scalar_dw value, char *base) { *(uint32_t*)(base + offset.u) = value.u; }\n"
"INLINE void GATHER(scalar_dw &dst, scalar_dw offset, const char *base) { dst.u = *(const uint32_t*)(base + offset.u); }\n"
"\n"
+"// 16 bit floating points\n"
+"INLINE void ADD_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u + v1.u; }\n"
+"INLINE void SUB_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u - v1.u; }\n"
+"INLINE void ADD_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s + v1.s; }\n"
+"INLINE void SUB_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s - v1.s; }\n"
+"INLINE void MUL_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s * v1.s; }\n"
+"INLINE void DIV_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s / v1.s; }\n"
+"INLINE void REM_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s % v1.s; }\n"
+"INLINE void MUL_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u * v1.u; }\n"
+"INLINE void DIV_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u / v1.u; }\n"
+"INLINE void REM_U16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.u = v0.u % v1.u; }\n"
+"INLINE void EQ_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s == v1.s ? ~0 : 0); }\n"
+"INLINE void NE_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s != v1.s ? ~0 : 0); }\n"
+"INLINE void LE_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); }\n"
+"INLINE void LT_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s < v1.s ? ~0 : 0); }\n"
+"INLINE void GE_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s >= v1.s ? ~0 : 0); }\n"
+"INLINE void GT_S16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.s > v1.s ? ~0 : 0); }\n"
+"INLINE void XOR_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s ^ v1.s; }\n"
+"INLINE void OR_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s | v1.s; }\n"
+"INLINE void AND_S16(scalar_w &dst, scalar_w v0, scalar_w v1) { dst.s = v0.s & v1.s; }\n"
+"INLINE void LE_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); }\n"
+"INLINE void LT_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u < v1.u ? ~0 : 0); }\n"
+"INLINE void GE_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); }\n"
+"INLINE void GT_U16(scalar_m &dst, scalar_w v0, scalar_w v1) { dst.u = (v0.u > v1.u ? ~0 : 0); }\n"
+"INLINE void LOAD(scalar_w &dst, const char *ptr) { dst.u = *(const uint16_t *) ptr; }\n"
+"INLINE void STORE(scalar_w src, char *ptr) { *(uint16_t *) ptr = src.u; }\n"
+"INLINE void LOADI(scalar_w &dst, uint16_t u) { dst.u = u; }\n"
+"INLINE void SCATTER(scalar_w offset, scalar_w value, char *base) { *(uint16_t*)(base + offset.u) = value.u; }\n"
+"INLINE void GATHER(scalar_w &dst, scalar_w offset, const char *base) { dst.u = *(const uint16_t*)(base + offset.u); }\n"
+"\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"// Identical instructions are forwarded\n"
"//////////////////////////////////////////////////////////////////////////////\n"
"\n"
+"// Forward identical 32 bit instructions\n"
"#define NOV_U32 MOV_S32\n"
"#define NOV_F MOV_S32\n"
"#define ADD_U32 ADD_S32\n"
"#define EQ_U32 EQ_S32\n"
"#define NE_U32 NE_S32\n"
"\n"
+"// Forward identical 16 bit instructions\n"
+"#define NOV_U16 MOV_S16\n"
+"#define ADD_U16 ADD_S16\n"
+"#define SUB_U16 SUB_S16\n"
+"#define AND_U16 AND_S16\n"
+"#define XOR_U16 XOR_S16\n"
+"#define OR_U16 OR_S16\n"
+"#define AND_U16 AND_S16\n"
+"#define EQ_U16 EQ_S16\n"
+"#define NE_U16 NE_S16\n"
+"\n"
"#undef PS2SI\n"
"#undef SI2PS\n"
"#undef ID\n"
void SimContext::emitMaskingCode(void) {
o << "simd" << simdWidth << "m " << "emask;\n"
- << "simd" << simdWidth << "dw " << "uip(scalar_dw(0u));\n"
+ //<< "simd" << simdWidth << "dw " << "uip(scalar_dw(0u));\n"
+ << "simd" << simdWidth << "dw " << "uip;\n"
<< "alltrueMask(emask);\n"
<< "uint32_t movedMask = ~0x0u;\n";
}
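With this change, the masking prologue emitted for a SIMD16 kernel reads roughly as below; note that uip is now left default-constructed instead of being zero-initialized through scalar_dw(0u):

simd16m emask;
simd16dw uip;
alltrueMask(emask);
uint32_t movedMask = ~0x0u;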
+++ /dev/null
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia@intel.com>
- */
-
-/**
- * \file llvm_gen_backend.cpp
- * \author Benjamin Segovia <benjamin.segovia@intel.com>
- *
- * Somehow a ad-hoc pass to remove the extra moves introduced by LLVM to Gen IR
- * pass
- */
-
}
static INLINE bool ok(int32_t x, int32_t y) { return x == y; }
static INLINE bool ok(uint32_t x, uint32_t y) { return x == y; }
+static INLINE bool ok(int16_t x, int16_t y) { return x == y; }
+static INLINE bool ok(uint16_t x, uint16_t y) { return x == y; }
#define CHECK_BINARY_OP(TYPE,FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\
do {\
CHECK_BINARY_OP(uint32_t,OR_U32, |,_5,_4,_0,data[index4],data[index0]);
}
}
+
+static void utestINT16(void)
+{
+ simd1w _0, _4, _5;
+ simd16w _1, _2, _3;
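+  // Operand table below contains no zero, so the DIV_S16 / REM_S16 checks never divide by zero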
+ const int16_t data[32] = {-1,1,-2,-3,4,-5,6,7,-8,9,10,11,12,13,14,15,8,
+ 9,10,11,12,-13,14,-15,-1,1,-2,3,4,5,6,7};
+ for (uint32_t i = 0; i < 32; ++i) {
+ const int index0 = rand() % 32;
+ const int index1 = rand() % 16;
+ const int index2 = rand() % 16;
+ const int index4 = rand() % 32;
+ LOAD(_0, (const char *) (data+index0));
+ LOAD(_1, (const char *) (data+index1));
+ LOAD(_2, (const char *) (data+index2));
+ LOAD(_4, (const char *) (data+index4));
+ CHECK_BINARY_OP(int16_t,ADD_S16,+,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,SUB_S16,-,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,MUL_S16,*,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,DIV_S16,/,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,REM_S16,%,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,AND_S16,&,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,XOR_S16,^,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,OR_S16, |,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(int16_t,ADD_S16,+,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,SUB_S16,-,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,MUL_S16,*,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,DIV_S16,/,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,REM_S16,%,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,AND_S16,&,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,XOR_S16,^,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,OR_S16, |,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(int16_t,ADD_S16,+,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(int16_t,SUB_S16,-,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(int16_t,MUL_S16,*,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(int16_t,DIV_S16,/,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(int16_t,REM_S16,%,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(int16_t,AND_S16,&,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(int16_t,XOR_S16,^,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(int16_t,OR_S16, |,_5,_4,_0,data[index4],data[index0]);
+ }
+}
+
+static void utestUINT16(void)
+{
+ simd1w _0, _4, _5;
+ simd16w _1, _2, _3;
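+  // Operand table below contains no zero, so the DIV_U16 / REM_U16 checks never divide by zero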
+ const uint16_t data[32] = {1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,8,
+ 9,10,11,12,13,14,15,1,1,2,3,4,5,6,7};
+ for (uint32_t i = 0; i < 32; ++i) {
+ const int index0 = rand() % 32;
+ const int index1 = rand() % 16;
+ const int index2 = rand() % 16;
+ const int index4 = rand() % 32;
+ LOAD(_0, (const char *) (data+index0));
+ LOAD(_1, (const char *) (data+index1));
+ LOAD(_2, (const char *) (data+index2));
+ LOAD(_4, (const char *) (data+index4));
+ CHECK_BINARY_OP(uint16_t,ADD_U16,+,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,SUB_U16,-,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,MUL_U16,*,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,DIV_U16,/,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,REM_U16,%,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,AND_U16,&,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,XOR_U16,^,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,OR_U16, |,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_BINARY_OP(uint16_t,ADD_U16,+,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,SUB_U16,-,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,MUL_U16,*,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,DIV_U16,/,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,REM_U16,%,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,AND_U16,&,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,XOR_U16,^,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,OR_U16, |,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_BINARY_OP(uint16_t,ADD_U16,+,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(uint16_t,SUB_U16,-,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(uint16_t,MUL_U16,*,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(uint16_t,DIV_U16,/,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(uint16_t,REM_U16,%,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(uint16_t,AND_U16,&,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(uint16_t,XOR_U16,^,_5,_4,_0,data[index4],data[index0]);
+ CHECK_BINARY_OP(uint16_t,OR_U16, |,_5,_4,_0,data[index4],data[index0]);
+ }
+}
+
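A usage note, not part of the patch: the new 16 bit SCATTER/GATHER helpers take byte offsets, so element i of a uint16_t buffer lives at byte offset 2*i. A minimal scalar round trip, reusing the ok() and INLINE conventions of this file:

static INLINE bool scatterGather16Sketch(void) {
  char buf[64] = { 0 };         // scratch memory addressed by the helpers
  scalar_w off, val, out;
  LOADI(off, 2 * 5);            // byte offset of element 5
  LOADI(val, 0x1234);
  SCATTER(off, val, buf);       // *(uint16_t *) (buf + 10) = 0x1234
  GATHER(out, off, buf);        // read it back from the same byte offset
  return ok(out.u, (uint16_t) 0x1234);
}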
#undef CHECK_BINARY_OP
#define CHECK_CMP_OP(FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\
}
}
+static void utestUINT16Cmp(void)
+{
+ simd1w _0, _4;
+ simd16w _1, _2;
+ simd8w _6, _7;
+ simd1m _5;
+ simd16m _3;
+ simd8m _8;
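+  // 64 entries: a simd8 load may start anywhere in the first 32 elements and still read in bounds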
+ const uint16_t data[64] = {11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9,10,
+ 9,10,11,12,13,14,15,1,1,2,3,4,5,6,7,
+ 10,11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9,
+ 9,10,11,12,13,14,15,1,1,2,3,4,5,6,7};
+ for (uint32_t i = 0; i < 32; ++i) {
+ const int index0 = rand() % 32;
+ const int index1 = rand() % 16;
+ const int index2 = rand() % 16;
+ const int index4 = rand() % 32;
+ const int index6 = rand() % 16;
+ const int index7 = rand() % 32;
+ LOAD(_0, (const char *) (data+index0));
+ LOAD(_1, (const char *) (data+index1));
+ LOAD(_2, (const char *) (data+index2));
+ LOAD(_4, (const char *) (data+index4));
+ LOAD(_6, (const char *) (data+index6));
+ LOAD(_7, (const char *) (data+index7));
+ CHECK_CMP_OP(GE_U16,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(LE_U16,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(GT_U16,>,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(LT_U16,<,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(EQ_U16,==,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(NE_U16,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(GE_U16,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(LE_U16,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(GT_U16,>,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(LT_U16,<,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(EQ_U16,==,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(NE_U16,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(GE_U16,>=,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(LE_U16,<=,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(GT_U16,>,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(LT_U16,<,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(EQ_U16,==,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(NE_U16,!=,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(GE_U16,>=,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(LE_U16,<=,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(GT_U16,>,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(LT_U16,<,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(EQ_U16,==,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(NE_U16,!=,_5,_4,_0,data[index4],data[index0]);
+ }
+}
+
+static void utestINT16Cmp(void)
+{
+ simd1w _0, _4;
+ simd16w _1, _2;
+ simd8w _6, _7;
+ simd1m _5;
+ simd16m _3;
+ simd8m _8;
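+  // 64 entries: a simd8 load may start anywhere in the first 32 elements and still read in bounds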
+ const int16_t data[64] = {-11,-12,13,14,-15,8,-1,-1,2,3,4,5,-6,7,8,9,10,
+ 9,10,-11,12,-13,14,15,1,1,2,-3,4,-5,6,7,
+ 10,11,-12,13,14,15,-8,1,1,2,-3,-4,5,-6,7,8,9,
+ 9,10,11,12,-13,14,15,-1,-1,-2,-3,-4,5,6,7};
+
+ for (uint32_t i = 0; i < 32; ++i) {
+ const int index0 = rand() % 32;
+ const int index1 = rand() % 16;
+ const int index2 = rand() % 16;
+ const int index4 = rand() % 32;
+ const int index6 = rand() % 16;
+ const int index7 = rand() % 32;
+ LOAD(_0, (const char *) (data+index0));
+ LOAD(_1, (const char *) (data+index1));
+ LOAD(_2, (const char *) (data+index2));
+ LOAD(_4, (const char *) (data+index4));
+ LOAD(_6, (const char *) (data+index6));
+ LOAD(_7, (const char *) (data+index7));
+ CHECK_CMP_OP(GE_S16,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(LE_S16,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(GT_S16,>,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(LT_S16,<,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(EQ_S16,==,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(NE_S16,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+ CHECK_CMP_OP(GE_S16,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(LE_S16,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(GT_S16,>,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(LT_S16,<,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(EQ_S16,==,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(NE_S16,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+ CHECK_CMP_OP(GE_S16,>=,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(LE_S16,<=,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(GT_S16,>,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(LT_S16,<,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(EQ_S16,==,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(NE_S16,!=,_3,_2,_0,data[i+index2],data[index0]);
+ CHECK_CMP_OP(GE_S16,>=,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(LE_S16,<=,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(GT_S16,>,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(LT_S16,<,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(EQ_S16,==,_5,_4,_0,data[index4],data[index0]);
+ CHECK_CMP_OP(NE_S16,!=,_5,_4,_0,data[index4],data[index0]);
+ }
+}
+
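One more sketch, not part of the patch, of the mask convention used by the comparison helpers: each result lane is all ones or all zeros, and the _S16/_U16 split only matters once the top bit is set, e.g. 0xffff is -1 signed but 65535 unsigned:

static INLINE bool cmpMask16Sketch(void) {
  scalar_w a, b;
  scalar_m s, u;
  LOADI(a, 0xffff);             // -1 as int16_t, 65535 as uint16_t
  LOADI(b, 1);
  GT_S16(s, a, b);              // signed:   -1 > 1     -> false, s.u == 0
  GT_U16(u, a, b);              // unsigned: 65535 > 1  -> true,  u.u is all ones
  return s.u == 0 && u.u != 0;
}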
static void utestFPCmp(void)
{
simd1dw _0, _4;
UTEST_EXPECT_SUCCESS(utestFP());
UTEST_EXPECT_SUCCESS(utestINT32());
UTEST_EXPECT_SUCCESS(utestUINT32());
+ UTEST_EXPECT_SUCCESS(utestINT16());
+ UTEST_EXPECT_SUCCESS(utestUINT16());
UTEST_EXPECT_SUCCESS(utestFPCmp());
UTEST_EXPECT_SUCCESS(utestINT32Cmp());
UTEST_EXPECT_SUCCESS(utestUINT32Cmp());
+ UTEST_EXPECT_SUCCESS(utestINT16Cmp());
+ UTEST_EXPECT_SUCCESS(utestUINT16Cmp());
UTEST_EXPECT_SUCCESS(utestScatterGather());
}