From b3517a4df213e926c63e4990175313a2963e5611 Mon Sep 17 00:00:00 2001
From: David Schleef
Date: Tue, 29 Jun 2010 19:43:50 -0700
Subject: [PATCH] sse: handle NANs and denormals

Set the DAZ and FTZ flags in the MXCSR, to set proper denormal
behavior. Implement NAN handling for maxf and minf.
---
 orc/orcprogram-sse.c |  3 +++
 orc/orcprogram.h     |  4 +++-
 orc/orcrules-sse.c   | 48 +++++++++++++++++++++++++++++++++++++++++--
 orc/orcsse.c         | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 orc/orcsse.h         |  3 +++
 orc/orcx86.c         |  1 +
 orc/orcx86.h         |  1 +
 7 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/orc/orcprogram-sse.c b/orc/orcprogram-sse.c
index 25c1f3c..c225296 100644
--- a/orc/orcprogram-sse.c
+++ b/orc/orcprogram-sse.c
@@ -593,6 +593,8 @@ orc_compiler_sse_assemble (OrcCompiler *compiler)
 
   orc_x86_emit_prologue (compiler);
 
+  orc_sse_set_mxcsr (compiler); /* emitted right after the prologue; no target_flags check is visible in this hunk */
+
   sse_load_constants_outer (compiler);
 
   if (compiler->program->is_2d) {
@@ -746,6 +748,7 @@ orc_compiler_sse_assemble (OrcCompiler *compiler)
 
   sse_save_accumulators (compiler);
 
+  orc_sse_restore_mxcsr (compiler); /* emitted just before the epilogue; reloads the MXCSR value saved by orc_sse_set_mxcsr */
   orc_x86_emit_epilogue (compiler);
 
   orc_x86_do_fixups (compiler);
diff --git a/orc/orcprogram.h b/orc/orcprogram.h
index 9dd2912..4aa1611 100644
--- a/orc/orcprogram.h
+++ b/orc/orcprogram.h
@@ -74,7 +74,9 @@ typedef void (*OrcExecutorFunc)(OrcExecutor *ex);
 enum {
   ORC_TARGET_C_C99 = (1<<0),
   ORC_TARGET_C_BARE = (1<<1),
-  ORC_TARGET_C_NOEXEC = (1<<2)
+  ORC_TARGET_C_NOEXEC = (1<<2),
+  ORC_TARGET_FAST_NAN = (1<<30), /* when set, the SSE minf/maxf rules emit a single minps/maxps */
+  ORC_TARGET_FAST_DENORMAL = (1<<31) /* NOTE(review): not tested anywhere in this patch; 1<<31 overflows a 32-bit signed int -- confirm */
 };
 
 enum {
diff --git a/orc/orcrules-sse.c b/orc/orcrules-sse.c
index 4fd8b19..93fd25e 100644
--- a/orc/orcrules-sse.c
+++ b/orc/orcrules-sse.c
@@ -955,11 +955,55 @@ BINARY_F(addf, "addps", 0x58)
 BINARY_F(subf, "subps", 0x5c)
 BINARY_F(mulf, "mulps", 0x59)
 BINARY_F(divf, "divps", 0x5e)
-BINARY_F(maxf, "maxps", 0x5f)
-BINARY_F(minf, "minps", 0x5d)
 UNARY_F(sqrtf, "sqrtps", 0x51)
 
 static void
+sse_rule_minf (OrcCompiler *p, void *user, OrcInstruction *insn)
+{ /* minf rule: replaces the former BINARY_F(minf, "minps", 0x5d) expansion */
+  if (p->target_flags & ORC_TARGET_FAST_NAN) { /* fast path: a single minps */
+    orc_sse_emit_0f (p, "minps", 0x5d,
+        p->vars[insn->src_args[1]].alloc,
+        p->vars[insn->dest_args[0]].alloc);
+  } else { /* emit minps with the operands in both orders, then por the two results */
+    orc_sse_emit_movdqa (p, /* args are (src_args[1], tmpreg); assumed to be src, dest -- TODO confirm */
+        p->vars[insn->src_args[1]].alloc,
+        p->tmpreg);
+    orc_sse_emit_0f (p, "minps", 0x5d, /* first order: (dest_args[0], tmpreg) */
+        p->vars[insn->dest_args[0]].alloc,
+        p->tmpreg);
+    orc_sse_emit_0f (p, "minps", 0x5d, /* second order: (src_args[1], dest_args[0]) */
+        p->vars[insn->src_args[1]].alloc,
+        p->vars[insn->dest_args[0]].alloc);
+    orc_sse_emit_por (p, /* presumably lets a NaN produced by either order survive in dest -- TODO confirm */
+        p->tmpreg,
+        p->vars[insn->dest_args[0]].alloc);
+  }
+}
+/* maxf rule: same structure as sse_rule_minf, emitting maxps (0x5f) instead of minps. */
+static void
+sse_rule_maxf (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  if (p->target_flags & ORC_TARGET_FAST_NAN) { /* fast path: a single maxps */
+    orc_sse_emit_0f (p, "maxps", 0x5f,
+        p->vars[insn->src_args[1]].alloc,
+        p->vars[insn->dest_args[0]].alloc);
+  } else { /* emit maxps with the operands in both orders, then por the two results */
+    orc_sse_emit_movdqa (p, /* args are (src_args[1], tmpreg); assumed to be src, dest -- TODO confirm */
+        p->vars[insn->src_args[1]].alloc,
+        p->tmpreg);
+    orc_sse_emit_0f (p, "maxps", 0x5f, /* first order: (dest_args[0], tmpreg) */
+        p->vars[insn->dest_args[0]].alloc,
+        p->tmpreg);
+    orc_sse_emit_0f (p, "maxps", 0x5f, /* second order: (src_args[1], dest_args[0]) */
+        p->vars[insn->src_args[1]].alloc,
+        p->vars[insn->dest_args[0]].alloc);
+    orc_sse_emit_por (p, /* presumably lets a NaN produced by either order survive in dest -- TODO confirm */
+        p->tmpreg,
+        p->vars[insn->dest_args[0]].alloc);
+  }
+}
+
+static void
 sse_rule_cmpeqf (OrcCompiler *p, void *user, OrcInstruction *insn)
 {
   orc_sse_emit_0f (p, "cmpeqps", 0xc2,
diff --git a/orc/orcsse.c b/orc/orcsse.c
index d250950..2647dad 100644
--- a/orc/orcsse.c
+++ b/orc/orcsse.c
@@ -275,3 +275,60 @@ void orc_x86_emit_mov_sse_reg (OrcCompiler *compiler, int reg1, int reg2)
   orc_x86_emit_modrm_reg (compiler, reg2, reg1);
 }
 
+void
+orc_sse_set_mxcsr (OrcCompiler *compiler)
+{
+  int value; /* immediate operand for the orl emitted below */
+  /* stmxcsr (bytes 0f ae, modrm field 3): store the current MXCSR into params[ORC_VAR_A4]. */
+  ORC_ASM_CODE(compiler," stmxcsr %d(%%%s)\n",
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A4]),
+      orc_x86_get_regname(compiler->exec_reg));
+  *compiler->codeptr++ = 0x0f;
+  *compiler->codeptr++ = 0xae;
+  orc_x86_emit_modrm_memoffset (compiler, 3,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A4]), compiler->exec_reg);
+  /* Copy the stored 4-byte value from params[ORC_VAR_A4] into gp_tmpreg. */
+  orc_x86_emit_mov_memoffset_reg (compiler, 4,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A4]),
+      compiler->exec_reg, compiler->gp_tmpreg);
+  /* Keep the unmodified value in params[ORC_VAR_C1] for orc_sse_restore_mxcsr. NOTE(review): A4 and C1 are overwritten -- confirm no program uses them. */
+  orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_C1]),
+      compiler->exec_reg);
+  /* OR 0x8040 (bits 15 and 6) into gp_tmpreg; per the commit message these are the FTZ and DAZ flags. */
+  value = 0x8040;
+  ORC_ASM_CODE(compiler," orl $%d, %%%s\n", value,
+      orc_x86_get_regname(compiler->gp_tmpreg));
+  orc_x86_emit_rex(compiler, 4, 0, 0, compiler->gp_tmpreg);
+  *compiler->codeptr++ = 0x81; /* opcode byte for the orl; the modrm below carries field value 1 */
+  orc_x86_emit_modrm_reg (compiler, compiler->gp_tmpreg, 1);
+  *compiler->codeptr++ = (value & 0xff); /* 32-bit immediate, least significant byte first */
+  *compiler->codeptr++ = ((value>>8) & 0xff);
+  *compiler->codeptr++ = ((value>>16) & 0xff);
+  *compiler->codeptr++ = ((value>>24) & 0xff);
+  /* Write the modified value back to params[ORC_VAR_A4] so ldmxcsr can read it from memory. */
+  orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A4]),
+      compiler->exec_reg);
+  /* ldmxcsr (bytes 0f ae, modrm field 2): load MXCSR from params[ORC_VAR_A4]. */
+  ORC_ASM_CODE(compiler," ldmxcsr %d(%%%s)\n",
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A4]),
+      orc_x86_get_regname(compiler->exec_reg));
+  *compiler->codeptr++ = 0x0f;
+  *compiler->codeptr++ = 0xae;
+  orc_x86_emit_modrm_memoffset (compiler, 2,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A4]), compiler->exec_reg);
+}
+/* Emit ldmxcsr from params[ORC_VAR_C1], the copy of MXCSR saved by orc_sse_set_mxcsr. */
+void
+orc_sse_restore_mxcsr (OrcCompiler *compiler)
+{
+  ORC_ASM_CODE(compiler," ldmxcsr %d(%%%s)\n",
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_C1]),
+      orc_x86_get_regname(compiler->exec_reg));
+  *compiler->codeptr++ = 0x0f;
+  *compiler->codeptr++ = 0xae;
+  orc_x86_emit_modrm_memoffset (compiler, 2,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_C1]), compiler->exec_reg);
+}
+
diff --git a/orc/orcsse.h b/orc/orcsse.h
index 15b93b6..f991f79 100644
--- a/orc/orcsse.h
+++ b/orc/orcsse.h
@@ -67,6 +67,9 @@ void orc_sse_emit_pshuflw (OrcCompiler *p, int shuf, int src, int dest);
 void orc_sse_emit_shiftimm (OrcCompiler *p, const char *insn_name,
     int code, int modrm_code, int shift, int reg);
 
+void orc_sse_set_mxcsr (OrcCompiler *compiler); /* emits code that saves MXCSR and ORs in 0x8040 */
+void orc_sse_restore_mxcsr (OrcCompiler *compiler); /* emits code that reloads the saved MXCSR */
+
 unsigned int orc_sse_get_cpu_flags (void);
 
 /* SSE instructions */
diff --git a/orc/orcx86.c b/orc/orcx86.c
index b89f364..3c10505 100644
--- a/orc/orcx86.c
+++ b/orc/orcx86.c
@@ -897,6 +897,7 @@ orc_x86_emit_prologue (OrcCompiler *compiler)
       orc_x86_emit_push (compiler, 4, X86_EBX);
     }
   }
+
 }
 
 void
diff --git a/orc/orcx86.h b/orc/orcx86.h
index bafc0b3..4358d20 100644
--- a/orc/orcx86.h
+++ b/orc/orcx86.h
@@ -43,6 +43,7 @@ void orc_x86_emit_add_reg_memoffset (OrcCompiler *compiler, int size, int reg1,
 void orc_x86_emit_and_imm_memoffset (OrcCompiler *compiler, int size, int value, int offset, int reg);
 void orc_x86_emit_add_imm_reg (OrcCompiler *compiler, int size, int value, int reg, orc_bool record);
 void orc_x86_emit_and_imm_reg (OrcCompiler *compiler, int size, int value, int reg);
+void orc_x86_emit_or_imm_reg (OrcCompiler *compiler, int size, int value, int reg); /* NOTE(review): no definition is added by this patch -- confirm one exists */
 void orc_x86_emit_add_reg_reg (OrcCompiler *compiler, int size, int reg1, int reg2);
 void orc_x86_emit_sub_reg_reg (OrcCompiler *compiler, int size, int reg1, int reg2);
 void orc_x86_emit_imul_memoffset_reg (OrcCompiler *compiler, int size,
-- 
2.7.4