From dd4c1dd0382277b080fb4981e027250e10658ae8 Mon Sep 17 00:00:00 2001
From: Keith Whitwell
Date: Wed, 18 May 2005 12:26:21 +0000
Subject: [PATCH] Generates working SSE code for gears under the swrast driver.

---
 src/mesa/tnl/t_vertex_sse.c | 185 ++++++++++++++++++++++++++++++--------------
 1 file changed, 127 insertions(+), 58 deletions(-)

diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c
index b4e2c5b..1771baa 100644
--- a/src/mesa/tnl/t_vertex_sse.c
+++ b/src/mesa/tnl/t_vertex_sse.c
@@ -42,6 +42,8 @@
 #define Z 2
 #define W 3
 
+#define DISASSEM 1
+
 struct x86_reg {
    GLuint file:3;
    GLuint idx:3;
@@ -144,6 +146,17 @@ static struct x86_reg make_disp( struct x86_reg reg,
    return reg;
 }
 
+static struct x86_reg deref( struct x86_reg reg )
+{
+   return make_disp(reg, 0);
+}
+
+static struct x86_reg get_base_reg( struct x86_reg reg )
+{
+   return make_reg( reg.file, reg.idx );
+}
+
+
 /* Retreive a reference to one of the function arguments, taking into
  * account any push/pop activity:
  */
@@ -179,29 +192,47 @@ static void emit_1b( struct x86_program *p, GLbyte b0 )
    *(GLbyte *)(p->csr++) = b0;
 }
 
-static void emit_1ub( struct x86_program *p, GLubyte b0 )
+static void emit_1i( struct x86_program *p, GLint i0 )
+{
+   *(GLint *)(p->csr) = i0;
+   p->csr += 4;
+}
+
+static void disassem( struct x86_program *p, const char *fn )
+{
+#if DISASSEM
+   static const char *last_fn;
+   if (fn && fn != last_fn) {
+      _mesa_printf("0x%x: %s\n", p->csr, fn); /* NOTE(review): p->csr is a pointer but is formatted with %x - confirm */
+      last_fn = fn;
+   }
+#endif
+}
+
+static void emit_1ub_fn( struct x86_program *p, GLubyte b0, const char *fn )
 {
+   disassem(p, fn);
    *(p->csr++) = b0;
 }
 
-static void emit_2ub( struct x86_program *p, GLubyte b0, GLubyte b1 )
+static void emit_2ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, const char *fn )
 {
+   disassem(p, fn);
    *(p->csr++) = b0;
    *(p->csr++) = b1;
 }
 
-static void emit_3ub( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2 )
+static void emit_3ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2, const char *fn )
 {
+   disassem(p, fn);
    *(p->csr++) = b0;
    *(p->csr++) = b1;
    *(p->csr++) = b2;
 }
 
-static void emit_1i( struct x86_program *p, GLint i0 )
-{
-   *(GLint *)(p->csr) = i0;
-   p->csr += 4;
-}
+#define emit_1ub(p, b0) emit_1ub_fn(p, b0, __FUNCTION__)
+#define emit_2ub(p, b0, b1) emit_2ub_fn(p, b0, b1, __FUNCTION__)
+#define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__)
 
 
 /* Labels, jumps and fixup:
@@ -216,7 +247,7 @@ static void emit_jcc( struct x86_program *p,
                       GLubyte *label )
 {
    GLint offset = label - (get_label(p) + 2);
-   
+
    if (offset <= 127 && offset >= -128) {
       emit_1ub(p, 0x70 + cc);
       emit_1b(p, (GLbyte) offset);
@@ -273,7 +304,7 @@ static void emit_dec( struct x86_program *p,
                       struct x86_reg reg )
 {
    assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x40 + reg.idx);
+   emit_1ub(p, 0x48 + reg.idx);
 }
 
 static void emit_ret( struct x86_program *p )
@@ -299,7 +330,13 @@ static void emit_modrm( struct x86_program *p,
    val |= reg.idx << 3; /* reg field */
    val |= regmem.idx; /* r/m field */
 
-   emit_1ub(p, val);
+   emit_1ub_fn(p, val, 0);
+
+   /* When regmem.idx is reg_SP, one more byte (0x24) is emitted right after the
+    * ModRM byte - presumably the x86 SIB byte; note this test ignores regmem.mod. */
+   if (regmem.idx == reg_SP) {
+      emit_1ub_fn(p, 0x24, 0); /* simplistic! */
+   }
 
    switch (regmem.mod) {
    case mod_REG:
@@ -307,8 +344,10 @@ static void emit_modrm( struct x86_program *p,
       break;
    case mod_DISP8:
       emit_1b(p, regmem.disp);
+      break;
    case mod_DISP32:
       emit_1i(p, regmem.disp);
+      break;
    }
 }
 
@@ -325,14 +364,14 @@ static void emit_op_modrm( struct x86_program *p,
 {
    switch (dst.mod) {
    case mod_REG:
-      emit_1ub(p, op_dst_is_reg);
+      emit_1ub_fn(p, op_dst_is_reg, 0);
       emit_modrm(p, dst, src);
       break;
    case mod_INDIRECT:
    case mod_DISP32:
    case mod_DISP8:
       assert(src.mod == mod_REG);
-      emit_1ub(p, op_dst_is_mem);
+      emit_1ub_fn(p, op_dst_is_mem, 0);
       emit_modrm(p, src, dst);
       break;
    }
@@ -352,6 +391,13 @@ static void emit_xor( struct x86_program *p,
    emit_op_modrm( p, 0x33, 0x31, dst, src );
 }
 
+static void emit_cmp( struct x86_program *p,
+                      struct x86_reg dst,
+                      struct x86_reg src )
+{
+   emit_op_modrm( p, 0x3b, 0x39, dst, src );
+}
+
 static void emit_movlps( struct x86_program *p,
                          struct x86_reg dst,
                          struct x86_reg src )
@@ -443,6 +489,14 @@ static void emit_packsswb( struct x86_program *p,
    emit_modrm( p, dst, src );
 }
 
+static void emit_packuswb( struct x86_program *p,
+                           struct x86_reg dst,
+                           struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x67);
+   emit_modrm( p, dst, src );
+}
+
 /* Load effective address:
  */
 static void emit_lea( struct x86_program *p,
@@ -461,6 +515,14 @@ static void emit_add_imm( struct x86_program *p,
    emit_lea(p, dst, make_disp(src, value));
 }
 
+static void emit_test( struct x86_program *p,
+                       struct x86_reg dst,
+                       struct x86_reg src )
+{
+   emit_1ub(p, 0x85);
+   emit_modrm( p, dst, src );
+}
+
 
 
 
@@ -487,7 +549,7 @@ static void emit_pk4ub( struct x86_program *p,
 {
    emit_cvtps2dq(p, dest, arg0);
    emit_packssdw(p, dest, dest);
-   emit_packsswb(p, dest, dest);
+   emit_packuswb(p, dest, dest);
 }
 
 static void emit_load4f_4( struct x86_program *p,
@@ -620,12 +682,12 @@ static void (*load[4][4])( struct x86_program *p,
 };
 
 static void emit_load( struct x86_program *p,
-                       struct x86_reg temp,
+                       struct x86_reg dest,
                        GLuint sz,
                        struct x86_reg src,
                        GLuint src_sz)
 {
-   load[sz-1][src_sz-1](p, temp, src);
+   load[sz-1][src_sz-1](p, dest, src);
 }
 
 
@@ -721,6 +783,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
    struct x86_reg tmp = make_reg(file_XMM, 0);
    struct x86_reg vp0 = make_reg(file_XMM, 1);
    struct x86_reg vp1 = make_reg(file_XMM, 2);
+   struct x86_reg chan0 = make_reg(file_XMM, 3);
    GLubyte *fixup, *label;
 
    p->csr = p->store;
@@ -731,6 +794,15 @@ static GLboolean build_vertex_emit( struct x86_program *p )
    emit_push(p, countEBP);
    emit_push(p, vtxESI);
+
+   /* XOR srcEDI with itself, load function argument 2 (the vertex count) into
+    * countEBP, compare the two and record a forward jump (fixup) taken on equality. */
+   emit_xor(p, srcEDI, srcEDI);
+   emit_mov(p, countEBP, make_fn_arg(p, 2));
+   emit_cmp(p, countEBP, srcEDI);
+   fixup = emit_jcc_forward(p, cc_E);
+
+
 
    /* Initialize destination register.
     */
    emit_mov(p, vertexEAX, make_fn_arg(p, 3));
@@ -741,10 +813,6 @@ static GLboolean build_vertex_emit( struct x86_program *p )
    emit_mov(p, vtxESI, make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
    vtxESI = make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));
 
-   /* Get vertex count, compare to zero
-    */
-   emit_mov(p, countEBP, make_fn_arg(p, 2));
-   fixup = emit_jcc_forward(p, cc_NZ);
 
    /* Possibly load vp0, vp1 for viewport calcs:
     */
@@ -753,6 +821,10 @@ static GLboolean build_vertex_emit( struct x86_program *p )
       emit_movups(p, vp1, make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
    }
 
+   /* Load vtx->chan_scale into chan0 unconditionally, whether or not any
+    * attribute format handled below ends up multiplying by it. */
+   emit_movups(p, chan0, make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
+
    /* Note address for loop jump
     */
    label = get_label(p);
@@ -775,40 +847,40 @@ static GLboolean build_vertex_emit( struct x86_program *p )
        */
       switch (a[j].format) {
       case EMIT_1F:
-         emit_load(p, tmp, 1, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 1, deref(srcEDI), vtx->attr[j].inputsize); /* NOTE(review): EMIT_1F/2F/3F have no break in this hunk, so each falls through to the next case - confirm intended */
          emit_store(p, dest, 1, tmp);
       case EMIT_2F:
-         emit_load(p, tmp, 2, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize);
          emit_store(p, dest, 2, tmp);
       case EMIT_3F:
          /* Potentially the worst case - hardcode 2+1 copying:
           */
-         emit_load(p, tmp, 3, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize);
          emit_store(p, dest, 3, tmp);
       case EMIT_4F:
-         emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
          emit_store(p, dest, 4, tmp);
          break;
       case EMIT_2F_VIEWPORT:
-         emit_load(p, tmp, 2, srcEDI, vtx->attr[j].inputsize);
-         emit_mulps(p, dest, vp0);
-         emit_addps(p, dest, vp1);
+         emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize);
+         emit_mulps(p, tmp, vp0);
+         emit_addps(p, tmp, vp1);
          emit_store(p, dest, 2, tmp);
          break;
       case EMIT_3F_VIEWPORT:
-         emit_load(p, tmp, 3, srcEDI, vtx->attr[j].inputsize);
-         emit_mulps(p, dest, vp0);
-         emit_addps(p, dest, vp1);
+         emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize);
+         emit_mulps(p, tmp, vp0);
+         emit_addps(p, tmp, vp1);
          emit_store(p, dest, 3, tmp);
          break;
       case EMIT_4F_VIEWPORT:
-         emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
-         emit_mulps(p, dest, vp0);
-         emit_addps(p, dest, vp1);
+         emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
+         emit_mulps(p, tmp, vp0);
+         emit_addps(p, tmp, vp1);
          emit_store(p, dest, 4, tmp);
          break;
       case EMIT_3F_XYW:
-         emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
          emit_pshufd(p, tmp, tmp, X, Y, W, Z);
          emit_store(p, dest, 3, tmp);
          break;
@@ -818,48 +890,56 @@ static GLboolean build_vertex_emit( struct x86_program *p )
       case EMIT_1UB_1F:
       case EMIT_3UB_3F_RGB:
       case EMIT_3UB_3F_BGR:
+         _mesa_printf("non-implemneted format %d\n", a[j].format);
          return GL_FALSE; /* add this later */
 
       case EMIT_4UB_4F_RGBA:
-         emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
+         emit_mulps(p, tmp, chan0);
          emit_pk4ub(p, tmp, tmp);
          emit_store(p, dest, 1, tmp);
          break;
       case EMIT_4UB_4F_BGRA:
-         emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
          emit_pshufd(p, tmp, tmp, Z, Y, X, W);
+         emit_mulps(p, tmp, chan0);
          emit_pk4ub(p, tmp, tmp);
          emit_store(p, dest, 1, tmp);
          break;
       case EMIT_4UB_4F_ARGB:
-         emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
          emit_pshufd(p, tmp, tmp, W, X, Y, Z);
+         emit_mulps(p, tmp, chan0);
          emit_pk4ub(p, tmp, tmp);
          emit_store(p, dest, 1, tmp);
          break;
       case EMIT_4UB_4F_ABGR:
-         emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+         emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
          emit_pshufd(p, tmp, tmp, W, Z, Y, X);
+         emit_mulps(p, tmp, chan0);
          emit_pk4ub(p, tmp, tmp);
          emit_store(p, dest, 1, tmp);
          break;
       case EMIT_4CHAN_4F_RGBA:
          switch (CHAN_TYPE) {
          case GL_UNSIGNED_BYTE:
-            emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+            emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
+            emit_mulps(p, tmp, chan0);
             emit_pk4ub(p, tmp, tmp);
             emit_store(p, dest, 1, tmp);
             break;
-         case GL_UNSIGNED_SHORT:
-            return GL_FALSE;
          case GL_FLOAT:
-            emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize);
+            emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);
             emit_store(p, dest, 4, tmp);
             break;
+         case GL_UNSIGNED_SHORT:
          default:
-            break;
+            _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
+            return GL_FALSE;
          }
+         break;
       default:
+         _mesa_printf("unknown a[%d].format %d\n", j, a[j].format);
          return GL_FALSE; /* catch any new opcodes */
       }
 
@@ -881,6 +961,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
    /* decr count, loop if not zero
     */
    emit_dec(p, countEBP);
+   emit_test(p, countEBP, countEBP);
    emit_jcc(p, cc_NZ, label);
 
    /* Land forward jump here:
@@ -889,7 +970,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
 
    /* Pop regs and return
     */
-   emit_pop(p, vtxESI);
+   emit_pop(p, get_base_reg(vtxESI));
    emit_pop(p, countEBP);
    emit_pop(p, srcEDI);
    emit_ret(p);
@@ -912,20 +993,8 @@ void _tnl_generate_sse_emit( GLcontext *ctx )
 
    if (build_vertex_emit(&p)) {
       _tnl_register_fastpath( vtx, GL_TRUE );
-
-      {
-         static int i = 0;
-         char filename[100];
-         int fd;
-
-         sprintf(filename, "fastpath%d.o", i);
-         fd = creat(filename, 0600);
-         if (fd != -1) {
-            write(fd, p.store, p.csr - p.store);
-            close(fd);
-            _mesa_printf("wrote %s\n", filename);
-         }
-      }
+      if (DISASSEM)
+         _mesa_printf("disassemble 0x%x 0x%x\n", p.store, p.csr); /* NOTE(review): p.store and p.csr are pointers formatted with %x - confirm */
    }
    else {
       FREE(p.store);
-- 
2.7.4