From 12b8d542b7465f495681d3dc0c50cd27f7e0ee94 Mon Sep 17 00:00:00 2001 From: Jojo R Date: Tue, 17 Nov 2020 15:29:03 +0800 Subject: [PATCH] norm.cpp(normL2Sqr_): improve performance of pipeline Most target machines execute each type of instruction on a dedicated type of CPU unit, e.g. every vx_load call uses the load/store unit and every v_muladd call uses the mul/mula unit. Interleave the vx_load and v_muladd calls so that loads and multiply-adds can overlap, which improves performance on most targets such as RISC-V or ARM. --- modules/core/src/norm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp index 9aaed8e..b95cd99 100644 --- a/modules/core/src/norm.cpp +++ b/modules/core/src/norm.cpp @@ -152,10 +152,10 @@ float normL2Sqr_(const float* a, const float* b, int n) { v_float32 t0 = vx_load(a + j) - vx_load(b + j); v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes); - v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes); - v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes); v_d0 = v_muladd(t0, t0, v_d0); + v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes); v_d1 = v_muladd(t1, t1, v_d1); + v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes); v_d2 = v_muladd(t2, t2, v_d2); v_d3 = v_muladd(t3, t3, v_d3); } -- 2.7.4