From 4d3fb1961cd065dfc835e1fc1a03918daf7bf134 Mon Sep 17 00:00:00 2001 From: Doug Nazar Date: Fri, 20 Sep 2019 04:56:35 -0400 Subject: [PATCH] neon: Enable Flush-to-zero mode for float programs Some of the instructions are not NEON (which always uses FTZ), but are actually VFP, which requires enabling FTZ mode. --- orc/orcprogram-neon.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index 05584a0..cecbcc0 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -476,6 +476,7 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) int align_shift; int var_size_shift; int i; + int set_fpscr = FALSE; align_var = get_align_var (compiler); if (compiler->error) return; @@ -487,6 +488,18 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_neon_emit_prologue (compiler); + if (orc_program_has_float (compiler)) { + set_fpscr = TRUE; + ORC_ASM_CODE (compiler," vmrs %s, fpscr\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xeef10a10 | ((compiler->gp_tmpreg&0xf)<<12)); + ORC_ASM_CODE (compiler," push %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xe52d0004 | ((compiler->gp_tmpreg&0xf)<<12)); + + orc_arm_emit_load_imm (compiler, compiler->gp_tmpreg, 1<<24); + ORC_ASM_CODE (compiler," vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12)); + } + orc_neon_load_constants_outer (compiler); if (compiler->program->is_2d) { @@ -697,6 +710,14 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_neon_save_accumulators (compiler); + if (set_fpscr) { + ORC_ASM_CODE (compiler," pop %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xe49d0004 | ((compiler->gp_tmpreg&0xf)<<12)); + + ORC_ASM_CODE (compiler," vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12)); + } + orc_neon_emit_epilogue (compiler); orc_arm_emit_align (compiler, 4); -- 2.7.4