From 4150920b95b8a7db84cd4607ede09f42b85530bb Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 31 Aug 2017 21:50:31 -0700 Subject: [PATCH] intel/fs: Add a helper for emitting scan operations This commit adds a helper to the builder for emitting "scan" operations. Given a binary operation #, a scan takes the vector [a0, a1, ..., aN] and returns the vector [a0, a0 # a1, ..., a0 # a1 # ... # aN] where each channel contains the combination of all previous channels. The sequence of instructions to perform the scan is fairly optimal; a 16-wide scan on a 32-bit type is only 6 instructions. The subgroup scan and reduction operations will be implemented in terms of this. Acked-by: Lionel Landwerlin Reviewed-by: Iago Toral Quiroga --- src/intel/compiler/brw_fs_builder.h | 141 ++++++++++++++++++++++++++++++++++++ src/intel/compiler/brw_ir_fs.h | 7 ++ 2 files changed, 148 insertions(+) diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index b157e33..cf603b0 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -412,6 +412,115 @@ namespace brw { return src_reg(component(dst, 0)); } + void + emit_scan(enum opcode opcode, const dst_reg &tmp, + unsigned cluster_size, brw_conditional_mod mod) const + { + assert(dispatch_width() >= 8); + + /* The instruction splitting code isn't advanced enough to split + * these so we need to handle that ourselves. + */ + if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) { + const unsigned half_width = dispatch_width() / 2; + const fs_builder ubld = exec_all().group(half_width, 0); + dst_reg left = tmp; + dst_reg right = horiz_offset(tmp, half_width); + ubld.emit_scan(opcode, left, cluster_size, mod); + ubld.emit_scan(opcode, right, cluster_size, mod); + if (cluster_size > half_width) { + src_reg left_comp = component(left, half_width - 1); + set_condmod(mod, ubld.emit(opcode, right, left_comp, right)); + } + return; + } + + if (cluster_size > 1) { + const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0); + dst_reg left = horiz_stride(tmp, 2); + dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2); + + /* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn": + * + * "When source or destination datatype is 64b or operation is + * integer DWord multiply, regioning in Align1 must follow + * these rules: + * + * [...] + * + * 3. Source and Destination offset must be the same, except + * the case of scalar source." + * + * In order to work around this, we create a temporary register + * and shift left over to match right. If we have a 64-bit type, + * we have to use two integer MOVs instead of a 64-bit MOV. + */ + if (need_matching_subreg_offset(opcode, tmp.type)) { + dst_reg tmp2 = vgrf(tmp.type); + dst_reg new_left = horiz_stride(horiz_offset(tmp2, 1), 2); + if (type_sz(tmp.type) > 4) { + ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 0), + subscript(left, BRW_REGISTER_TYPE_D, 0)); + ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 1), + subscript(left, BRW_REGISTER_TYPE_D, 1)); + } else { + ubld.MOV(new_left, left); + } + left = new_left; + } + set_condmod(mod, ubld.emit(opcode, right, left, right)); + } + + if (cluster_size > 2) { + if (type_sz(tmp.type) <= 4 && + !need_matching_subreg_offset(opcode, tmp.type)) { + const fs_builder ubld = + exec_all().group(dispatch_width() / 4, 0); + src_reg left = horiz_stride(horiz_offset(tmp, 1), 4); + + dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + + right = horiz_stride(horiz_offset(tmp, 3), 4); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + } else { + /* For 64-bit types, we have to do things differently because + * the code above would land us with destination strides that + * the hardware can't handle. Fortunately, we'll only be + * 8-wide in that case and it's the same number of + * instructions. + */ + const fs_builder ubld = exec_all().group(2, 0); + + for (unsigned i = 0; i < dispatch_width(); i += 4) { + src_reg left = component(tmp, i + 1); + dst_reg right = horiz_offset(tmp, i + 2); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + } + } + } + + if (cluster_size > 4) { + const fs_builder ubld = exec_all().group(4, 0); + src_reg left = component(tmp, 3); + dst_reg right = horiz_offset(tmp, 4); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + + if (dispatch_width() > 8) { + left = component(tmp, 8 + 3); + right = horiz_offset(tmp, 8 + 4); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + } + } + + if (cluster_size > 8 && dispatch_width() > 8) { + const fs_builder ubld = exec_all().group(8, 0); + src_reg left = component(tmp, 7); + dst_reg right = horiz_offset(tmp, 8); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + } + } + /** * Assorted arithmetic ops. * @{ @@ -644,6 +753,38 @@ namespace brw { } } + + /* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn": + * + * "When source or destination datatype is 64b or operation is + * integer DWord multiply, regioning in Align1 must follow + * these rules: + * + * [...] + * + * 3. Source and Destination offset must be the same, except + * the case of scalar source." + * + * This helper just detects when we're in this case. + */ + bool + need_matching_subreg_offset(enum opcode opcode, + enum brw_reg_type type) const + { + if (!shader->devinfo->is_cherryview && + !gen_device_info_is_9lp(shader->devinfo)) + return false; + + if (type_sz(type > 4)) + return true; + + if (opcode == BRW_OPCODE_MUL && + !brw_reg_type_is_floating_point(type)) + return true; + + return false; + } + bblock_t *block; exec_node *cursor; diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index cd60363..54797ff 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -312,6 +312,13 @@ subscript(fs_reg reg, brw_reg_type type, unsigned i) return byte_offset(retype(reg, type), i * type_sz(type)); } +static inline fs_reg +horiz_stride(fs_reg reg, unsigned s) +{ + reg.stride *= s; + return reg; +} + static const fs_reg reg_undef; class fs_inst : public backend_instruction { -- 2.7.4