From 711ca33bc6da03daf2115c7a82ae2a56f73d67a3 Mon Sep 17 00:00:00 2001 From: Martin Koehler Date: Mon, 7 Sep 2015 14:33:26 +0200 Subject: [PATCH] Improved Ximatcopy when lda==ldb. The Ximatcopy functions create a copy of the input matrix although their interface suggests that they work in place. The new XIMATCOPY_K_YY routines perform the operation in place if the leading dimension does not change (lda == ldb). --- CONTRIBUTORS.md | 3 + common_c.h | 19 +++++ common_d.h | 9 ++ common_level3.h | 24 ++++++ common_macro.h | 26 ++++++ common_param.h | 30 +++++++ common_s.h | 8 ++ common_z.h | 18 ++++ interface/imatcopy.c | 35 +++++++- interface/zimatcopy.c | 50 +++++++++++- kernel/Makefile.L3 | 181 +++++++++++++++++++++++++++++++++++++++++ kernel/generic/imatcopy_cn.c | 67 +++++++++++++++ kernel/generic/imatcopy_ct.c | 91 +++++++++++++++++++++ kernel/generic/imatcopy_rn.c | 72 ++++++++++++++++ kernel/generic/imatcopy_rt.c | 64 +++++++++++++++ kernel/generic/zimatcopy_cn.c | 67 +++++++++++++++ kernel/generic/zimatcopy_cnc.c | 67 +++++++++++++++ kernel/generic/zimatcopy_ct.c | 82 +++++++++++++++++++ kernel/generic/zimatcopy_ctc.c | 85 +++++++++++++++++++ kernel/generic/zimatcopy_rn.c | 66 +++++++++++++++ kernel/generic/zimatcopy_rnc.c | 65 +++++++++++++++ kernel/generic/zimatcopy_rt.c | 80 ++++++++++++++++++ kernel/generic/zimatcopy_rtc.c | 82 +++++++++++++++++++ 23 files changed, 1288 insertions(+), 3 deletions(-) create mode 100644 kernel/generic/imatcopy_cn.c create mode 100644 kernel/generic/imatcopy_ct.c create mode 100644 kernel/generic/imatcopy_rn.c create mode 100644 kernel/generic/imatcopy_rt.c create mode 100644 kernel/generic/zimatcopy_cn.c create mode 100644 kernel/generic/zimatcopy_cnc.c create mode 100644 kernel/generic/zimatcopy_ct.c create mode 100644 kernel/generic/zimatcopy_ctc.c create mode 100644 kernel/generic/zimatcopy_rn.c create mode 100644 kernel/generic/zimatcopy_rnc.c create mode 100644 kernel/generic/zimatcopy_rt.c create mode 100644 kernel/generic/zimatcopy_rtc.c diff --git 
a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b88e367..88e461d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -127,5 +127,8 @@ In chronological order: * Ton van den Heuvel * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). +* Martin Koehler + * [2015-09-07] Improved imatcopy + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/common_c.h b/common_c.h index 741d7d0..ce0f2a5 100644 --- a/common_c.h +++ b/common_c.h @@ -220,6 +220,15 @@ #define COMATCOPY_K_CTC comatcopy_k_ctc #define COMATCOPY_K_RTC comatcopy_k_rtc +#define CIMATCOPY_K_CN cimatcopy_k_cn +#define CIMATCOPY_K_RN cimatcopy_k_rn +#define CIMATCOPY_K_CT cimatcopy_k_ct +#define CIMATCOPY_K_RT cimatcopy_k_rt +#define CIMATCOPY_K_CNC cimatcopy_k_cnc +#define CIMATCOPY_K_RNC cimatcopy_k_rnc +#define CIMATCOPY_K_CTC cimatcopy_k_ctc +#define CIMATCOPY_K_RTC cimatcopy_k_rtc + #define CGEADD_K cgeadd_k #else @@ -403,6 +412,16 @@ #define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc #define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc + +#define CIMATCOPY_K_CN gotoblas -> cimatcopy_k_cn +#define CIMATCOPY_K_RN gotoblas -> cimatcopy_k_rn +#define CIMATCOPY_K_CT gotoblas -> cimatcopy_k_ct +#define CIMATCOPY_K_RT gotoblas -> cimatcopy_k_rt +#define CIMATCOPY_K_CNC gotoblas -> cimatcopy_k_cnc +#define CIMATCOPY_K_RNC gotoblas -> cimatcopy_k_rnc +#define CIMATCOPY_K_CTC gotoblas -> cimatcopy_k_ctc +#define CIMATCOPY_K_RTC gotoblas -> cimatcopy_k_rtc + #define CGEADD_K gotoblas -> cgeadd_k #endif diff --git a/common_d.h b/common_d.h index d6dfd7f..ad99451 100644 --- a/common_d.h +++ b/common_d.h @@ -149,6 +149,11 @@ #define DOMATCOPY_K_RN domatcopy_k_rn #define DOMATCOPY_K_CT domatcopy_k_ct #define DOMATCOPY_K_RT domatcopy_k_rt + +#define DIMATCOPY_K_CN dimatcopy_k_cn +#define DIMATCOPY_K_RN dimatcopy_k_rn +#define DIMATCOPY_K_CT dimatcopy_k_ct +#define DIMATCOPY_K_RT 
dimatcopy_k_rt #define DGEADD_K dgeadd_k #else @@ -267,6 +272,10 @@ #define DOMATCOPY_K_RN gotoblas -> domatcopy_k_rn #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt +#define DIMATCOPY_K_CN gotoblas -> dimatcopy_k_cn +#define DIMATCOPY_K_RN gotoblas -> dimatcopy_k_rn +#define DIMATCOPY_K_CT gotoblas -> dimatcopy_k_ct +#define DIMATCOPY_K_RT gotoblas -> dimatcopy_k_rt #define DGEADD_K gotoblas -> dgeadd_k diff --git a/common_level3.h b/common_level3.h index e0ecbc4..1f5490b 100644 --- a/common_level3.h +++ b/common_level3.h @@ -1736,31 +1736,55 @@ int somatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLAS int somatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int simatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG); int domatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); +int dimatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG); int comatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, 
float *, BLASLONG); int comatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int comatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int zomatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zimatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zomatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rnc(BLASLONG, 
BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zimatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 8555baa..4976e76 100644 --- a/common_macro.h +++ b/common_macro.h @@ -634,6 +634,11 @@ #define OMATCOPY_K_RN DOMATCOPY_K_RN #define OMATCOPY_K_CT DOMATCOPY_K_CT #define OMATCOPY_K_RT DOMATCOPY_K_RT +#define IMATCOPY_K_CN DIMATCOPY_K_CN +#define IMATCOPY_K_RN DIMATCOPY_K_RN +#define IMATCOPY_K_CT DIMATCOPY_K_CT +#define IMATCOPY_K_RT DIMATCOPY_K_RT + #define GEADD_K DGEADD_K #else @@ -931,6 +936,10 @@ #define OMATCOPY_K_RN SOMATCOPY_K_RN #define OMATCOPY_K_CT SOMATCOPY_K_CT #define OMATCOPY_K_RT SOMATCOPY_K_RT +#define IMATCOPY_K_CN SIMATCOPY_K_CN +#define IMATCOPY_K_RN SIMATCOPY_K_RN +#define IMATCOPY_K_CT SIMATCOPY_K_CT +#define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K #endif @@ -1747,6 +1756,15 @@ #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC +#define IMATCOPY_K_CN ZIMATCOPY_K_CN +#define IMATCOPY_K_RN ZIMATCOPY_K_RN +#define IMATCOPY_K_CT ZIMATCOPY_K_CT +#define IMATCOPY_K_RT ZIMATCOPY_K_RT +#define IMATCOPY_K_CNC ZIMATCOPY_K_CNC +#define IMATCOPY_K_RNC ZIMATCOPY_K_RNC +#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC +#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC + #define GEADD_K ZGEADD_K #else @@ -2160,6 
+2178,14 @@ #define OMATCOPY_K_RNC COMATCOPY_K_RNC #define OMATCOPY_K_CTC COMATCOPY_K_CTC #define OMATCOPY_K_RTC COMATCOPY_K_RTC +#define IMATCOPY_K_CN CIMATCOPY_K_CN +#define IMATCOPY_K_RN CIMATCOPY_K_RN +#define IMATCOPY_K_CT CIMATCOPY_K_CT +#define IMATCOPY_K_RT CIMATCOPY_K_RT +#define IMATCOPY_K_CNC CIMATCOPY_K_CNC +#define IMATCOPY_K_RNC CIMATCOPY_K_RNC +#define IMATCOPY_K_CTC CIMATCOPY_K_CTC +#define IMATCOPY_K_RTC CIMATCOPY_K_RTC #define GEADD_K CGEADD_K diff --git a/common_param.h b/common_param.h index 1b56e85..ab40dde 100644 --- a/common_param.h +++ b/common_param.h @@ -830,31 +830,61 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); + int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); + int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, 
BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int 
(*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); diff --git a/common_s.h b/common_s.h index a4d8679..3c16008 100644 --- a/common_s.h +++ b/common_s.h @@ -152,6 +152,10 @@ #define SOMATCOPY_K_RN somatcopy_k_rn #define SOMATCOPY_K_CT somatcopy_k_ct #define SOMATCOPY_K_RT somatcopy_k_rt +#define SIMATCOPY_K_CN simatcopy_k_cn +#define SIMATCOPY_K_RN simatcopy_k_rn +#define SIMATCOPY_K_CT simatcopy_k_ct +#define SIMATCOPY_K_RT simatcopy_k_rt #define SGEADD_K sgeadd_k @@ -274,6 +278,10 @@ #define SOMATCOPY_K_RN gotoblas -> somatcopy_k_rn #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt +#define SIMATCOPY_K_CN gotoblas -> simatcopy_k_cn +#define SIMATCOPY_K_RN gotoblas -> simatcopy_k_rn +#define SIMATCOPY_K_CT gotoblas -> simatcopy_k_ct +#define SIMATCOPY_K_RT gotoblas -> simatcopy_k_rt #define SGEADD_K gotoblas -> sgeadd_k diff --git a/common_z.h b/common_z.h index b171227..b4f58bb 100644 --- a/common_z.h +++ b/common_z.h @@ -220,6 +220,15 @@ #define ZOMATCOPY_K_CTC zomatcopy_k_ctc #define ZOMATCOPY_K_RTC zomatcopy_k_rtc +#define ZIMATCOPY_K_CN zimatcopy_k_cn +#define 
ZIMATCOPY_K_RN zimatcopy_k_rn +#define ZIMATCOPY_K_CT zimatcopy_k_ct +#define ZIMATCOPY_K_RT zimatcopy_k_rt +#define ZIMATCOPY_K_CNC zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC zimatcopy_k_rtc + #define ZGEADD_K zgeadd_k #else @@ -404,6 +413,15 @@ #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc +#define ZIMATCOPY_K_CN gotoblas -> zimatcopy_k_cn +#define ZIMATCOPY_K_RN gotoblas -> zimatcopy_k_rn +#define ZIMATCOPY_K_CT gotoblas -> zimatcopy_k_ct +#define ZIMATCOPY_K_RT gotoblas -> zimatcopy_k_rt +#define ZIMATCOPY_K_CNC gotoblas -> zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC gotoblas -> zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC gotoblas -> zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC gotoblas -> zimatcopy_k_rtc + #define ZGEADD_K gotoblas -> zgeadd_k #endif diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 89f0ec8..f4309a8 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -50,6 +51,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef malloc #undef free +/* Enables the New IMATCOPY code with inplace operation if lda == ldb */ +#define NEW_IMATCOPY + #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { @@ -75,7 +79,6 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, #else void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb) { - char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; @@ -117,6 +120,34 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } +#ifdef NEW_IMATCOPY + if ( *lda == *ldb ) { + if ( order == BlasColMajor ) + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); + } + } + else + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); + } + } + return; + } + +#endif if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT); diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 3f273cf..798bff5 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -49,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BlasTransConj 2 #define BlasConj 3 +#define NEW_IMATCOPY #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) @@ -124,6 +126,52 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, return; } +#ifdef NEW_IMATCOPY + if (*lda == *ldb) { + if ( order == BlasColMajor ) + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + else + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + return; /* void routine: must not return a value; the in-place kernel call above did the work */ + } +#endif + if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4ef351d..7da4bcb 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -334,11 +334,15 @@ endif SBLASOBJS += \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ sgeadd_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) 
dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dgeadd_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ @@ -346,6 +350,10 @@ CBLASOBJS += \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cgeadd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ @@ -353,6 +361,10 @@ ZBLASOBJS += \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) @@ -3305,6 +3317,34 @@ endif $(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef DIMATCOPY_CN +DIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RN +DIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef DIMATCOPY_CT +DIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + 
+$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RT +DIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef SOMATCOPY_CN SOMATCOPY_CN = ../arm/omatcopy_cn.c endif @@ -3333,6 +3373,34 @@ endif $(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef SIMATCOPY_CN +SIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RN +SIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SIMATCOPY_CT +SIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + +$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RT +SIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef COMATCOPY_CN COMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3390,6 +3458,63 @@ endif $(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef CIMATCOPY_CN +CIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RN +CIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ 
$< -o $@ + +ifndef CIMATCOPY_CT +CIMATCOPY_CT = ../generic/zimatcopy_ct.c +endif + +$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RT +CIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_CNC +CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RNC +CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_CTC +CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RTC +CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + + ifndef ZOMATCOPY_CN ZOMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3447,6 +3572,62 @@ endif $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef ZIMATCOPY_CN +ZIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RN +ZIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CT +ZIMATCOPY_CT = 
../generic/zimatcopy_ct.c +endif + +$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RT +ZIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CNC +ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RNC +ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_CTC +ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RTC +ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + ifndef SGEADD_K SGEADD_K = ../generic/geadd.c diff --git a/kernel/generic/imatcopy_cn.c b/kernel/generic/imatcopy_cn.c new file mode 100644 index 0000000..e63bc97 --- /dev/null +++ b/kernel/generic/imatcopy_cn.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2015-09-07 grisuthedragon +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda) +{ + BLASLONG i,j; + FLOAT *aptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + if ( alpha == 1.0 ) return(0); + + aptr = a; + if ( alpha == 0.0 ) + { + for ( i=0; i