The popcount function returns the number of non-zero bits in its input.
Use the GEN instruction cbit (Count Bits Set) to implement it.
Signed-off-by: Luo Xionghu <xionghu.luo@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
[GEN_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_FBH] = { .name = "fbh", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_FBL] = { .name = "fbl", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_CBIT] = { .name = "cbit", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
case SEL_OP_READ_ARF: p->MOV(dst, src); break;
case SEL_OP_FBH: p->FBH(dst, src); break;
case SEL_OP_FBL: p->FBL(dst, src); break;
+ case SEL_OP_CBIT: p->CBIT(dst, src); break;
case SEL_OP_NOT: p->NOT(dst, src); break;
case SEL_OP_RNDD: p->RNDD(dst, src); break;
case SEL_OP_RNDU: p->RNDU(dst, src); break;
GEN_OPCODE_LZD = 74,
GEN_OPCODE_FBH = 75,
GEN_OPCODE_FBL = 76,
+ GEN_OPCODE_CBIT = 77,
GEN_OPCODE_ADDC = 78,
GEN_OPCODE_SUBB = 79,
GEN_OPCODE_SAD2 = 80,
ALU1(RNDU)
ALU1(FBH)
ALU1(FBL)
+ ALU1(CBIT)
ALU1(F16TO32)
ALU1(F32TO16)
ALU2(SEL)
ALU1(MOV)
ALU1(FBH)
ALU1(FBL)
+ ALU1(CBIT)
ALU2(SUBB)
ALU2(UPSAMPLE_SHORT)
ALU2(UPSAMPLE_INT)
ALU2WithTemp(MUL_HI)
ALU1(FBH)
ALU1(FBL)
+ ALU1(CBIT)
ALU2WithTemp(HADD)
ALU2WithTemp(RHADD)
ALU2(UPSAMPLE_SHORT)
static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType) {
if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
return insnType;
- if (opcode == ir::OP_FBH || opcode == ir::OP_FBL)
+ if (opcode == ir::OP_FBH || opcode == ir::OP_FBL || opcode == ir::OP_CBIT)
return ir::TYPE_U32;
if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
return insnType;
case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
case ir::OP_FBH: sel.FBH(dst, src); break;
case ir::OP_FBL: sel.FBL(dst, src); break;
+ case ir::OP_CBIT: sel.CBIT(dst, src); break;
case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
DECL_SELECTION_IR(I64_MUL_HI, I64MULHIInstruction)
DECL_SELECTION_IR(FBH, UnaryInstruction)
DECL_SELECTION_IR(FBL, UnaryInstruction)
+DECL_SELECTION_IR(CBIT, UnaryInstruction)
DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
DECL_EMIT_FUNCTION(MOV)
DECL_EMIT_FUNCTION(FBH)
DECL_EMIT_FUNCTION(FBL)
+ DECL_EMIT_FUNCTION(CBIT)
DECL_EMIT_FUNCTION(COS)
DECL_EMIT_FUNCTION(SIN)
DECL_EMIT_FUNCTION(LOG)
Instruction FBH(Type type, Register dst, Register src);
/*! fbl.type dst src */
Instruction FBL(Type type, Register dst, Register src);
+ /*! cbit.type dst src */
+ Instruction CBIT(Type type, Register dst, Register src);
/*! hadd.type dst src */
Instruction HADD(Type type, Register dst, Register src0, Register src1);
/*! rhadd.type dst src */
DECL_INSN(I64_MUL_HI, BinaryInstruction)
DECL_INSN(FBH, UnaryInstruction)
DECL_INSN(FBL, UnaryInstruction)
+DECL_INSN(CBIT, UnaryInstruction)
DECL_INSN(HADD, BinaryInstruction)
DECL_INSN(RHADD, BinaryInstruction)
DECL_INSN(I64HADD, BinaryInstruction)
longn upsample (intn hi, uintn lo)
ulongn upsample (uintn hi, uintn lo)
-# XXX not implemented
-#gentype popcount (gentype x)
+gentype popcount (gentype x)
##fast_integer
gentype mad24 (gentype x, gentype y, gentype z)
PURE CONST uint __gen_ocl_fbh(uint);
PURE CONST uint __gen_ocl_fbl(uint);
+PURE CONST uint __gen_ocl_cbit(uint);
OVERLOADABLE char clz(char x) {
if (x < 0)
return v;
}
+/* popcount for signed 8-bit values. __gen_ocl_cbit takes a uint, so a
+ * negative char reaches it with 24 additional set bits from the widening
+ * conversion; those are subtracted from the count. */
+OVERLOADABLE char popcount(char x) {
+  if (x == 0)
+    return 0;
+  if (x < 0)
+    return __gen_ocl_cbit(x) - 24;
+  return __gen_ocl_cbit(x);
+}
+/* popcount for signed 16-bit values. __gen_ocl_cbit takes a uint, so a
+ * negative short reaches it with 16 additional set bits from the widening
+ * conversion; those are subtracted from the count. */
+OVERLOADABLE short popcount(short x) {
+  if (x == 0)
+    return 0;
+  if (x < 0)
+    return __gen_ocl_cbit(x) - 16;
+  return __gen_ocl_cbit(x);
+}
+/* Generates a popcount overload for TYPE that maps zero to zero and
+ * otherwise forwards the argument to __gen_ocl_cbit without any
+ * correction of the result. */
+#define SDEF(TYPE) \
+OVERLOADABLE TYPE popcount(TYPE x) { \
+  if (x == 0) \
+    return 0; \
+  return __gen_ocl_cbit(x); \
+}
+SDEF(uchar);
+SDEF(ushort);
+SDEF(int);
+SDEF(uint);
+#undef SDEF
+
+/* popcount for signed 64-bit values: the argument is viewed as two ints
+ * through a union, each is counted with the int overload, and the two
+ * counts are added. */
+OVERLOADABLE long popcount(long x) {
+  union { long whole; int half[2]; } bits;
+  bits.whole = x;
+  const uint upper = popcount(bits.half[1]);
+  const uint lower = popcount(bits.half[0]);
+  return upper + lower;
+}
+
+/* popcount for unsigned 64-bit values: the argument is viewed as two uints
+ * through a union, each is counted with the uint overload, and the two
+ * counts are added. */
+OVERLOADABLE ulong popcount(ulong x) {
+  union { ulong whole; uint half[2]; } bits;
+  bits.whole = x;
+  const uint upper = popcount(bits.half[1]);
+  const uint lower = popcount(bits.half[0]);
+  return upper + lower;
+}
// sat
#define SDEF(TYPE) \
OVERLOADABLE long clz(long x);
OVERLOADABLE ulong clz(ulong x);
+OVERLOADABLE char popcount(char x);
+OVERLOADABLE uchar popcount(uchar x);
+OVERLOADABLE short popcount(short x);
+OVERLOADABLE ushort popcount(ushort x);
+OVERLOADABLE int popcount(int x);
+OVERLOADABLE uint popcount(uint x);
+OVERLOADABLE long popcount(long x);
+OVERLOADABLE ulong popcount(ulong x);
+
OVERLOADABLE char mul_hi(char x, char y);
OVERLOADABLE uchar mul_hi(uchar x, uchar y);
OVERLOADABLE short mul_hi(short x, short y);
regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
case GEN_OCL_FBH:
case GEN_OCL_FBL:
+ case GEN_OCL_CBIT:
case GEN_OCL_COS:
case GEN_OCL_SIN:
case GEN_OCL_SQR:
}
case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+ case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT); break;
case GEN_OCL_ABS:
{
const ir::Register src = this->getRegister(*AI);
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit)
// saturate convert
DECL_LLVM_GEN_FUNCTION(SAT_CONV_U8_TO_I8, _Z16convert_char_sath)