From 20dfa501b3950c1d2f5da7126091792f5eb67038 Mon Sep 17 00:00:00 2001
From: Chad Versace
Date: Wed, 9 Jan 2013 11:46:42 -0800
Subject: [PATCH] i965/fs/gen7: Emit code for GLSL 3.00 pack/unpack operations (v4)

v2: Remove lewd comment. [for idr]
v3: - Optimize away tmp register for packHalf2x16. [for anholt, paul]
    - Improve comments. [for anholt, paul]
    - Reduce near-duplicate code by removing vec4_visitor emit_pack/unpack
      methods. [for chadv]
v4: Factor out UD/W register conversion into helper function. [for anholt]

Reviewed-by: Eric Anholt
Reviewed-by: Ian Romanick (v2)
Signed-off-by: Chad Versace
---
 src/mesa/drivers/dri/i965/brw_defines.h            |   3 +
 src/mesa/drivers/dri/i965/brw_fs.h                 |   8 ++
 .../dri/i965/brw_fs_channel_expressions.cpp        |  12 +++
 src/mesa/drivers/dri/i965/brw_fs_emit.cpp          | 105 ++++++++++++++++++++-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |  19 +++-
 5 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index e2f1e65..79cc12f 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -726,6 +726,9 @@ enum opcode {
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
    FS_OPCODE_SET_GLOBAL_OFFSET,
+   FS_OPCODE_PACK_HALF_2x16_SPLIT,
+   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
+   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
 
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b47b0d0..d332502 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -542,6 +542,14 @@ private:
                                    struct brw_reg offset);
    void generate_discard_jump(fs_inst *inst);
 
+   void generate_pack_half_2x16_split(fs_inst *inst,
+                                      struct brw_reg dst,
+                                      struct brw_reg x,
+                                      struct brw_reg y);
+   void generate_unpack_half_2x16_split(fs_inst *inst,
+                                        struct brw_reg dst,
+                                        struct brw_reg src);
+
    void patch_discard_jumps_to_fb_writes();
 
    struct brw_context *brw;
diff 
--git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 58521ee..e19da51 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -342,9 +342,21 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
       assert(!"not yet supported");
       break;
 
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_pack_half_2x16:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_half_2x16:
    case ir_quadop_vector:
       assert(!"should have been lowered");
       break;
+
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
+   case ir_binop_pack_half_2x16_split:
+      assert(!"not reached: expression operates on scalars only");
+      break;
    }
 
    ir->remove();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 324e665..27c5302 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -922,6 +922,95 @@ fs_generator::generate_set_global_offset(fs_inst *inst,
    brw_pop_insn_state(p);
 }
 
+/**
+ * Change the register's data type from UD to W, doubling the strides in order
+ * to compensate for halving the data type width.
+ */
+static struct brw_reg
+ud_reg_to_w(struct brw_reg r)
+{
+   assert(r.type == BRW_REGISTER_TYPE_UD);
+   r.type = BRW_REGISTER_TYPE_W;
+
+   /* The BRW_*_STRIDE enums are defined so that incrementing the field
+    * doubles the real stride.
+    */
+   if (r.hstride != 0)
+      ++r.hstride;
+   if (r.vstride != 0)
+      ++r.vstride;
+
+   return r;
+}
+
+void
+fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
+                                            struct brw_reg dst,
+                                            struct brw_reg x,
+                                            struct brw_reg y)
+{
+   assert(intel->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_UD);
+   assert(x.type == BRW_REGISTER_TYPE_F);
+   assert(y.type == BRW_REGISTER_TYPE_F);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the destination data type must be Word (W).
+    *
+    *   The destination must be DWord-aligned and specify a horizontal stride
+    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
+    *   each destination channel and the upper word is not modified.
+    */
+   struct brw_reg dst_w = ud_reg_to_w(dst);
+
+   /* Give each 32-bit channel of dst the form below, where "." means
+    * unchanged.
+    *   0x....hhhh
+    */
+   brw_F32TO16(p, dst_w, y);
+
+   /* Now the form:
+    *   0xhhhh0000
+    */
+   brw_SHL(p, dst, dst, brw_imm_ud(16u));
+
+   /* And, finally the form of packHalf2x16's output:
+    *   0xhhhhllll
+    */
+   brw_F32TO16(p, dst_w, x);
+}
+
+void
+fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
+                                              struct brw_reg dst,
+                                              struct brw_reg src)
+{
+   assert(intel->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+   assert(src.type == BRW_REGISTER_TYPE_UD);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the source data type must be Word (W). The destination type must be
+    *   F (Float).
+    */
+   struct brw_reg src_w = ud_reg_to_w(src);
+
+   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
+    * For the Y case, we wish to access only the upper word; therefore
+    * a 16-bit subregister offset is needed on src_w, the operand we emit.
+    */
+   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
+          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
+   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
+      src_w.subnr += 2;
+
+   brw_F16TO32(p, dst, src_w);
+}
+
 void
 fs_generator::generate_code(exec_list *instructions)
 {
@@ -1082,7 +1171,12 @@ fs_generator::generate_code(exec_list *instructions)
       case BRW_OPCODE_SHL:
          brw_SHL(p, dst, src[0], src[1]);
          break;
-
+      case BRW_OPCODE_F32TO16:
+         brw_F32TO16(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_F16TO32:
+         brw_F16TO32(p, dst, src[0]);
+         break;
       case BRW_OPCODE_CMP:
          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
          break;
@@ -1229,6 +1323,15 @@ fs_generator::generate_code(exec_list *instructions)
          generate_set_global_offset(inst, dst, src[0], src[1]);
          break;
 
+      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+         generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+         generate_unpack_half_2x16_split(inst, dst, src[0]);
+         break;
+
       default:
          if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
             _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index ebb37fd..2b1332f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -538,7 +538,20 @@ fs_visitor::visit(ir_expression *ir)
                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
                   this->result, op[0], op[1]);
       break;
-
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_half_2x16:
+   case ir_unop_pack_half_2x16:
+      assert(!"not reached: should be handled by lower_packing_builtins");
+      break;
+   case ir_unop_unpack_half_2x16_split_x:
+      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
+      break;
+   case ir_unop_unpack_half_2x16_split_y:
+      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
+      break;
    case ir_binop_pow:
       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
       break;
@@ -566,7 +579,9 @@ fs_visitor::visit(ir_expression *ir)
       else
          inst = emit(SHR(this->result, op[0], op[1]));
       break;
-
+   case ir_binop_pack_half_2x16_split:
+      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
+      break;
    case ir_binop_ubo_load:
       /* This IR node takes a constant uniform block and a constant or
        * variable byte offset within the block and loads a vector from that.
-- 
2.7.4