From aa2709c4e07890d2c1d20e7c5da76fd885e84f9d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 26 Jun 2014 12:22:29 +0200 Subject: [PATCH] enabled optimized dgemm kernel for NEHALEM --- kernel/x86_64/KERNEL.NEHALEM | 23 +++++++++++------------ lapack-netlib/TESTING/nep.in | 2 +- param.h | 4 ++-- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 665d2d6..736e419 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -13,13 +13,13 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm_kernel_4x4_core2.S -DGEMMINCOPY = -DGEMMITCOPY = -DGEMMONCOPY = gemm_ncopy_4.S -DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = +DGEMMKERNEL = gemm_kernel_2x8_nehalem.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -48,11 +48,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S -DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S -DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S -DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S -DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S - +DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S diff --git a/lapack-netlib/TESTING/nep.in b/lapack-netlib/TESTING/nep.in index c4a4149..12e3ea4 100644 --- a/lapack-netlib/TESTING/nep.in +++ b/lapack-netlib/TESTING/nep.in @@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines 0 5 7 3 200 Values of INIBL (nibble crossover point) 1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts) 0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2) -20.0 Threshold value +70.0 Threshold value T Put T to test the error exits 1 Code to interpret the seed NEP 21 diff --git a/param.h b/param.h index 995f905..80e73ca 100644 --- a/param.h +++ b/param.h @@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -- 2.7.4