From 2cae9f2d4a57dc9cca934b489df43f0ec4eb98bc Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 5 Dec 2012 16:19:43 -0800 Subject: [PATCH] i965/fs: Add empirically-determined instruction latencies for gen7. v2: Actually switch on the other math instructions mentioned in the comment. v3: Add timing data for textureSize(), and clean up some long comment lines. Testing shader_time of fs16 shaders on a few frames of various apps: nexuiz improved by 2.9% +/- 1.5% (n=10) no difference on GLB2.5 (n=36, outliers removed) no difference on GLB2.7 (n=25) etqw improved by 2.6% +/- 2.2% (n=25) no difference on lightsmark (n=25) Acked-by: Kenneth Graunke --- .../dri/i965/brw_fs_schedule_instructions.cpp | 182 ++++++++++++++++++++- 1 file changed, 179 insertions(+), 3 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp index 458854c..3fbca6c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp @@ -57,7 +57,7 @@ static bool debug = false; class schedule_node : public exec_node { public: - schedule_node(fs_inst *inst) + schedule_node(fs_inst *inst, int gen) { this->inst = inst; this->child_array_size = 0; @@ -67,10 +67,14 @@ public: this->parent_count = 0; this->unblocked_time = 0; - set_latency_gen4(); + if (gen >= 7) + set_latency_gen7(); + else + set_latency_gen4(); } void set_latency_gen4(); + void set_latency_gen7(); fs_inst *inst; schedule_node **children; @@ -120,6 +124,178 @@ schedule_node::set_latency_gen4() } } +void +schedule_node::set_latency_gen7() +{ + switch (inst->opcode) { + case BRW_OPCODE_MAD: + /* 3 cycles (this is said to be 4 cycles sometimes depending on the + * register numbers in the sources): + * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q }; + * + * 20 cycles: + * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 
1Q }; + * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; + */ + latency = 17; + break; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + /* 2 cycles: + * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; + * + * 18 cycles: + * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * Same for exp2, log2, rsq, sqrt, sin, cos. + */ + latency = 16; + break; + + case SHADER_OPCODE_POW: + /* 2 cycles: + * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; + * + * 26 cycles: + * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + */ + latency = 24; + break; + + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXL: + /* 18 cycles: + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * + * 697 +/- 49 cycles (min 610, n=26): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * So the latency of our first texture load of the batchbuffer is + * ~700 cycles, since the caches are cold at that point. 
+ * + * 840 +/- 92 cycles (min 720, n=25): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * On the second load, it takes just an extra ~140 cycles, and after + * accounting for the 14 cycles of the MOV's latency, that makes ~130. + * + * 683 +/- 49 cycles (min = 602, n=47): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * send(8) g50<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * The unit appears to be pipelined, since this matches up with the + * cache-cold case, despite there being two loads here. If you replace + * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39). + * + * So, take some number between the cache-hot 140 cycles and the + * cache-cold 700 cycles. No particular tuning was done on this. + * + * I haven't done significant testing of the non-TEX opcodes. TXL at + * least looked about the same as TEX. 
+ */ + latency = 200; + break; + + case SHADER_OPCODE_TXS: + /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41 + * cycles (n=15): + * mov(8) g114<1>UD 0D { align1 WE_normal 1Q }; + * send(8) g6<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q }; + * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q }; + * + * + * Two loads were 535 +/- 30 cycles (n=19): + * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; + * send(16) g6<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; + * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; + * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H }; + * send(16) g8<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; + * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H }; + * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H }; + * + * Since the only caches that should matter are just the + * instruction/state cache containing the surface state, assume that we + * always have hot caches. + */ + latency = 100; + break; + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + /* Testing using varying-index pull constants: + * + * 16 cycles: + * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * + * ~480 cycles: + * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * ~620 cycles: + * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * So, if it's cache-hot, it's about 140. 
If it's cache cold, it's + * about 460. We expect to mostly be cache hot, so pick something more + * in that direction. + */ + latency = 200; + break; + + default: + /* 2 cycles: + * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; + * + * 16 cycles: + * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + */ + latency = 14; + break; + } +} + class instruction_scheduler { public: instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count, @@ -159,7 +335,7 @@ public: void instruction_scheduler::add_inst(fs_inst *inst) { - schedule_node *n = new(mem_ctx) schedule_node(inst); + schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen); assert(!inst->is_head_sentinel()); assert(!inst->is_tail_sentinel()); -- 2.7.4