intel/fs: add 64 bit integer multiplication lowering

author Paulo Zanoni <paulo.r.zanoni@intel.com>

Thu, 11 Jul 2019 22:08:03 +0000 (15:08 -0700)

committer Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>

Mon, 12 Aug 2019 22:16:23 +0000 (15:16 -0700)
author Paulo Zanoni <paulo.r.zanoni@intel.com>
Thu, 11 Jul 2019 22:08:03 +0000 (15:08 -0700)
committer Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Mon, 12 Aug 2019 22:16:23 +0000 (15:16 -0700)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp

index ccf6c95..a4ebae3 100644 (file)
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -3991,6 +3991,62 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
  }
  
  void
+fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
+{
+   const fs_builder ibld(this, block, inst);
+
+   /* Considering two 64-bit integers ab and cd where each letter        ab
+    * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
+    * only need to provide the YZ part of the result.               -------
+    *                                                                    BD
+    *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
+    *  about the lower 32 bits (since they are part of the upper     +  BC
+    *  32 bits of our result). AC is not needed since it starts      + AC
+    *  on the 65th bit of the result.                               -------
+    *                                                                  WXYZ
+    */
+   unsigned int q_regs = regs_written(inst);
+   unsigned int d_regs = (q_regs + 1) / 2;
+
+   fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
+   fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+   fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+
+   /* Here we need the full 64 bit result for 32b * 32b. */
+   if (devinfo->has_integer_dword_mul) {
+      ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+               subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
+   } else {
+      fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+      fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+      fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);
+
+      fs_inst *mul = ibld.MUL(acc,
+                            subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
+      mul->writes_accumulator = true;
+
+      ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+                subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
+      ibld.MOV(bd_low, acc);
+
+      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
+      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
+   }
+
+   ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
+            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
+   ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));
+
+   ibld.ADD(ad, ad, bc);
+   ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
+            subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);
+
+   ibld.MOV(inst->dst, bd);
+}
+
+void
  fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
  {
     const fs_builder ibld(this, block, inst);
@@ -4062,10 +4118,19 @@ fs_visitor::lower_integer_multiplication()
  
     foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
        if (inst->opcode == BRW_OPCODE_MUL) {
-         if (!inst->dst.is_accumulator() &&
-             (inst->dst.type == BRW_REGISTER_TYPE_D ||
-              inst->dst.type == BRW_REGISTER_TYPE_UD) &&
-             !devinfo->has_integer_dword_mul) {
+         if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
+              inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
+             (inst->src[0].type == BRW_REGISTER_TYPE_Q ||
+              inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
+             (inst->src[1].type == BRW_REGISTER_TYPE_Q ||
+              inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
+            lower_mul_qword_inst(inst, block);
+            inst->remove(block);
+            progress = true;
+         } else if (!inst->dst.is_accumulator() &&
+                    (inst->dst.type == BRW_REGISTER_TYPE_D ||
+                     inst->dst.type == BRW_REGISTER_TYPE_UD) &&
+                    !devinfo->has_integer_dword_mul) {
              lower_mul_dword_inst(inst, block);
              inst->remove(block);
              progress = true;
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h

index fb78fa8..a3604ae 100644 (file)
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -407,6 +407,7 @@ private:
     void resolve_inot_sources(const brw::fs_builder &bld, nir_alu_instr *instr,
                               fs_reg *op);
     void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
+   void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
     void lower_mulh_inst(fs_inst *inst, bblock_t *block);
  };
author	Paulo Zanoni <paulo.r.zanoni@intel.com>
	Thu, 11 Jul 2019 22:08:03 +0000 (15:08 -0700)
committer	Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
	Mon, 12 Aug 2019 22:16:23 +0000 (15:16 -0700)
src/intel/compiler/brw_fs.cpp		patch \| blob \| history
src/intel/compiler/brw_fs.h		patch \| blob \| history