-ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
-include $(KERNELDIR)/KERNEL.POWER8
-else
#SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_power10.S
+#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
-endif
#endif
const float *mvecp = mvec;
/* We have to load the reverse mask for big endian. */
- /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
+#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif
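+  /* Note: the two masks encode the same swap of real and imaginary words
+     within each complex float; the little-endian mask is the element-wise
+     complement of the big-endian one (mask_le[i] == 15 - mask_be[i]),
+     compensating for the reversed byte order of vector elements on
+     little-endian targets. */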
+
long ytmp;
__asm__
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 48, 0(%4) \n\t"
+ "stxv 49, 16(%4) \n\t"
+ "stxv 50, 32(%4) \n\t"
+ "stxv 51, 48(%4) \n\t"
+ "stxv 34, 64(%4) \n\t"
+ "stxv 35, 80(%4) \n\t"
+ "stxv 38, 96(%4) \n\t"
+ "stxv 39, 112(%4) \n\t"
+#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
+#endif
"addi %4, %4, 128 \n\t"
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 48, 0(%4) \n\t"
+ "stxv 49, 16(%4) \n\t"
+ "stxv 50, 32(%4) \n\t"
+ "stxv 51, 48(%4) \n\t"
+ "stxv 34, 64(%4) \n\t"
+ "stxv 35, 80(%4) \n\t"
+ "stxv 38, 96(%4) \n\t"
+ "stxv 39, 112(%4) \n\t"
+#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
+#endif
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
:
".align 5 \n"
"one%=: \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 32, 0(%3) \n\t"
+ "stxv 33, 16(%3) \n\t"
+ "stxv 34, 32(%3) \n\t"
+ "stxv 35, 48(%3) \n\t"
+ "stxv 36, 64(%3) \n\t"
+ "stxv 37, 80(%3) \n\t"
+ "stxv 38, 96(%3) \n\t"
+ "stxv 39, 112(%3) \n\t"
+#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
+#endif
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 40, 128(%3) \n\t"
+ "stxv 41, 144(%3) \n\t"
+ "stxv 42, 160(%3) \n\t"
+ "stxv 43, 176(%3) \n\t"
+ "stxv 44, 192(%3) \n\t"
+ "stxv 45, 208(%3) \n\t"
+ "stxv 46, 224(%3) \n\t"
+ "stxv 47, 240(%3) \n\t"
+#else
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
+#endif
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"bgt one%= \n"
"two%=: \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 32, 0(%3) \n\t"
+ "stxv 33, 16(%3) \n\t"
+ "stxv 34, 32(%3) \n\t"
+ "stxv 35, 48(%3) \n\t"
+ "stxv 36, 64(%3) \n\t"
+ "stxv 37, 80(%3) \n\t"
+ "stxv 38, 96(%3) \n\t"
+ "stxv 39, 112(%3) \n\t"
+ "stxv 40, 128(%3) \n\t"
+ "stxv 41, 144(%3) \n\t"
+ "stxv 42, 160(%3) \n\t"
+ "stxv 43, 176(%3) \n\t"
+ "stxv 44, 192(%3) \n\t"
+ "stxv 45, 208(%3) \n\t"
+ "stxv 46, 224(%3) \n\t"
+ "stxv 47, 240(%3) \n\t"
+#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
-
+#endif
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),
#else
#include "common.h"
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8
if ((inc_x == 1) && (inc_y == 1)) {
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
BLASLONG n1 = n & -16;
#else
BLASLONG n1 = n & -8;
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
+#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif
__asm__
(
"dcbt 0, %2 \n\t"
"xxswapd 33, 34 \n\t"
"xvaddsp 35, 35, 32 \n\t"
"xvaddsp 34, 34, 33 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xxpermdi 34, 35, 34, 0 \n\t"
+#else
"xxpermdi 34, 34, 35, 2 \n\t"
+#endif
"stxv 34, 0(%6) \n\t"
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
#include "cgemm_macros_power10.S"
+#if (_AIX)
+.set perm_const1, 0x0405060700010203
+.set perm_const2, 0x0c0d0e0f08090a0b
+.set save_permute_12, 0x1011121300010203
+.set save_permute_11, 0x18191a1b08090a0b
+#else
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
-
+#endif
#ifndef NEEDPARAM
/* load reverse permute mask for big endian
   uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
-
+#if (_AIX)
+ lis T2, (perm_const2>>48 & 0xFFFF)
+ lis T1, (perm_const1>>48 & 0xFFFF)
+ lis T3, (save_permute_12>>48 & 0xFFFF)
+ lis T4, (save_permute_11>>48 & 0xFFFF)
+
+ ori T2, T2, (perm_const2>>32 & 0xFFFF)
+ ori T1, T1, (perm_const1>>32 & 0xFFFF)
+ ori T3, T3, (save_permute_12>>32 & 0xFFFF)
+ ori T4, T4, (save_permute_11>>32 & 0xFFFF)
+#else
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
-
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
-
+#endif
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
+#if (_AIX)
+ oris T2, T2, (perm_const2>>16 & 0xFFFF)
+ oris T1, T1, (perm_const1>>16 & 0xFFFF)
+ oris T3, T3, (save_permute_12>>16 & 0xFFFF)
+ oris T4, T4, (save_permute_11>>16 & 0xFFFF)
+
+ ori T2, T2, (perm_const2 & 0xFFFF)
+ ori T1, T1, (perm_const1 & 0xFFFF)
+ ori T3, T3, (save_permute_12 & 0xFFFF)
+ ori T4, T4, (save_permute_11 & 0xFFFF)
+#else
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
-
+#endif
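+/* Both paths materialize each 64-bit constant from four 16-bit pieces:
+       lis    T, <bits 48..63>    - piece loaded shifted left by 16
+       ori    T, T, <bits 32..47> - low word now holds the constant's upper word
+       rldicr T, T, 32, 31        - rotate that word into the upper half
+       oris   T, T, <bits 16..31>
+       ori    T, T, <bits 0..15>
+   The ELF path writes the pieces with the @highest/@higher/@h/@l
+   operators; the AIX/XCOFF assembler does not accept those, so the AIX
+   path spells out the equivalent shift-and-mask expressions. */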
li r0,0
li PRE,512
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 3, 36, 34
+ xvf32gerpp 2, 37, 34
+ xvf32gerpp 1, 32, 34
+ xvf32gerpp 0, 33, 34
+ xvf32gerpp 7, 36, 35
+ xvf32gerpp 6, 37, 35
+ xvf32gerpp 5, 32, 35
+ xvf32gerpp 4, 33, 35
+#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
+#endif
.endm
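+/* On big endian each accumulator takes the other register of the
+   lxvp-loaded B pair (vs34 in place of vs35 and vice versa); the two
+   16-byte halves of a loaded pair end up in opposite registers of the
+   pair on the two byte orders, so the operands are mirrored to keep the
+   products identical. */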
.macro LOAD4x8_2
.endm
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ xvf32gerpp 3, 36, 34
+ xvf32gerpp 2, 37, 34
+ xvf32gerpp 1, 32, 34
+ xvf32gerpp 0, 33, 34
+ xvf32gerpp 7, 36, 35
+ xvf32gerpp 6, 37, 35
+ xvf32gerpp 5, 32, 35
+ xvf32gerpp 4, 33, 35
+#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
+#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ xvf32gerpp 3, 42, 38
+ xvf32gerpp 2, 43, 38
+ xvf32gerpp 1, 40, 38
+ xvf32gerpp 0, 41, 38
+ xvf32gerpp 7, 42, 39
+ xvf32gerpp 6, 43, 39
+ xvf32gerpp 5, 40, 39
+ xvf32gerpp 4, 41, 39
+#else
xvf32gerpp 3, 42, 39
xvf32gerpp 2, 43, 39
xvf32gerpp 1, 40, 39
xvf32gerpp 6, 43, 38
xvf32gerpp 5, 40, 38
xvf32gerpp 4, 41, 38
+#endif
.if \Complete==0
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs1, vs0, vs8, 1
+ xxpermdi vs3, vs2, vs10, 1
+ xxpermdi vs5, vs4, vs12, 1
+ xxpermdi vs7, vs6, vs14, 1
+ xxpermdi vs9, vs8, vs0, 1
+ xxpermdi vs11, vs10, vs2, 1
+#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
+#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs13, vs12, vs4, 1
+ xxpermdi vs15, vs14, vs6, 1
+#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
+#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ xxpermdi vs25, vs0, vs8, 1
+ xxpermdi vs24, vs2, vs10, 1
+ xxpermdi vs27, vs4, vs12, 1
+ xxpermdi vs26, vs6, vs14, 1
+ xxpermdi vs29, vs8, vs0, 1
+ xxpermdi vs28, vs10, vs2, 1
+ xxpermdi vs31, vs12, vs4, 1
+ xxpermdi vs30, vs14, vs6, 1
+#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs12, vs4, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
+#endif
stxvp vs24, 0(CO)
MULT_APLHA_PART1 vs48, vs56, vs0, vs1
MULT_APLHA_PART1 vs49, vs16, vs2, vs3
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs1, vs0, vs8, 1
+ xxpermdi vs3, vs2, vs10, 1
+ xxpermdi vs5, vs4, vs12, 1
+ xxpermdi vs7, vs6, vs14, 1
+ xxpermdi vs9, vs8, vs0, 1
+ xxpermdi vs11, vs10, vs2, 1
+#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
+#endif
xvaddsp vs32, vs32, vs3
xvaddsp vs33, vs33, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs13, vs12, vs4, 1
+ xxpermdi vs15, vs14, vs6, 1
+#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
+#endif
xvaddsp vs40, vs40, vs7
xvaddsp vs41, vs41, vs5
xvaddsp vs34, vs34, vs11
xvaddsp vs42, vs42, vs15
xvaddsp vs43, vs43, vs13
#else
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ xxpermdi vs33, vs0, vs8, 1
+ xxpermdi vs32, vs2, vs10, 1
+ xxpermdi vs41, vs4, vs12, 1
+ xxpermdi vs40, vs6, vs14, 1
+ xxpermdi vs35, vs8, vs0, 1
+ xxpermdi vs34, vs10, vs2, 1
+ xxpermdi vs43, vs12, vs4, 1
+ xxpermdi vs42, vs14, vs6, 1
+#else
xxpermdi vs33, vs8, vs0, 2
xxpermdi vs32, vs10, vs2, 2
xxpermdi vs41, vs12, vs4, 2
xxpermdi vs43, vs4, vs12, 2
xxpermdi vs42, vs6, vs14, 2
#endif
+#endif
stxvp vs32, 0(T2)
stxvp vs40, 32(T2)
stxvp vs34, 0(T3)
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 3, 32, 35
+ xvf32gerpp 2, 33, 35
+ xvf32gerpp 1, 32, 34
+ xvf32gerpp 0, 33, 34
+#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
+#endif
.endm
.macro LOAD4x4_2
.endm
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 3, 32, 35
+ xvf32gerpp 2, 33, 35
+ xvf32gerpp 1, 32, 34
+ xvf32gerpp 0, 33, 34
+#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
+#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 3, 36, 39
+ xvf32gerpp 2, 37, 39
+ xvf32gerpp 1, 36, 38
+ xvf32gerpp 0, 37, 38
+#else
xvf32gerpp 3, 36, 38
xvf32gerpp 2, 37, 38
xvf32gerpp 1, 36, 39
xvf32gerpp 0, 37, 39
+#endif
.if \Complete==0
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs1, vs0, vs8, 1
+ xxpermdi vs3, vs2, vs10, 1
+ xxpermdi vs9, vs8, vs0, 1
+ xxpermdi vs11, vs10, vs2, 1
+ xxpermdi vs5, vs4, vs12, 1
+ xxpermdi vs7, vs6, vs14, 1
+ xxpermdi vs13, vs12, vs4, 1
+ xxpermdi vs15, vs14, vs6, 1
+#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
+#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs25, vs0, vs8, 1
+ xxpermdi vs24, vs2, vs10, 1
+ xxpermdi vs27, vs8, vs0, 1
+ xxpermdi vs26, vs10, vs2, 1
+ xxpermdi vs29, vs4, vs12, 1
+ xxpermdi vs28, vs6, vs14, 1
+ xxpermdi vs31, vs12, vs4, 1
+ xxpermdi vs30, vs14, vs6, 1
+#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
+#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
stxvp vs28, 0(T2)
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 1, 35, 32
+ xvf32gerpp 0, 34, 32
+#else
xvf32gerpp 1, 34, 32
xvf32gerpp 0, 35, 32
+#endif
.endm
.macro LOAD4x2_2
.endm
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 1, 35, 32
+ xvf32gerpp 0, 34, 32
+#else
xvf32gerpp 1, 34, 33
xvf32gerpp 0, 35, 33
+#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 1, 37, 33
+ xvf32gerpp 0, 36, 33
+#else
xvf32gerpp 1, 36, 32
xvf32gerpp 0, 37, 32
+#endif
.if \Complete==0
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs1, vs0, vs8, 0
+ xxpermdi vs9, vs2, vs10, 0
+ xxpermdi vs3, vs8, vs0, 3
+ xxpermdi vs11, vs10, vs2, 3
+#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs10, vs2, 0
xxpermdi vs3, vs0, vs8, 3
xxpermdi vs11, vs2, vs10, 3
+#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
xvaddsp vs25, vs25, vs3
xvaddsp vs27, vs27, vs11
#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs24, vs0, vs8, 0
+ xxpermdi vs26, vs2, vs10, 0
+ xxpermdi vs25, vs8, vs0, 3
+ xxpermdi vs27, vs10, vs2, 3
+#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs10, vs2, 0
xxpermdi vs25, vs0, vs8, 3
xxpermdi vs27, vs2, vs10, 3
#endif
+#endif
stxv vs24, 0(CO)
stxv vs25, 0(T1)
stxv vs26, 0(T2)
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 0, 34, 32
+ xvf32gerpp 1, 35, 32
+#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
+#endif
.endm
.macro LOAD4x1_2
.macro LOAD4x1_2O OffsetA, OffsetB
lxv vs32, (\OffsetA)(AO)
vspltisb v6, 0
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs33, vs32, vs38, 2
+ xxpermdi vs32, vs32, vs38, 0
+#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
+#endif
lxvp vs34, (0+\OffsetB)(BO)
lxvp vs36, (32+\OffsetB)(BO)
.endm
.endm
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 0, 34, 32
+ xvf32gerpp 1, 35, 32
+#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
+#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 0, 36, 33
+ xvf32gerpp 1, 37, 33
+#else
xvf32gerpp 0, 37, 33
xvf32gerpp 1, 36, 33
+#endif
.if \Complete==0
lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs33, vs32, vs38, 2
+ xxpermdi vs32, vs32, vs38, 0
+#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
+#endif
.endif
.if \IsLast==1
.if \Complete==1
.endm
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 2, 37, 34
+ xvf32gerpp 3, 36, 34
+ xvf32gerpp 0, 33, 34
+ xvf32gerpp 1, 32, 34
+#else
xvf32gerpp 2, 37, 35
xvf32gerpp 3, 36, 35
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
+#endif
.if \Complete==0
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 2, 41, 35
+ xvf32gerpp 3, 40, 35
+ xvf32gerpp 0, 39, 35
+ xvf32gerpp 1, 38, 35
+#else
xvf32gerpp 2, 41, 34
xvf32gerpp 3, 40, 34
xvf32gerpp 0, 39, 34
xvf32gerpp 1, 38, 34
+#endif
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs1, vs0, vs8, 1
+ xxpermdi vs3, vs2, vs10, 1
+ xxpermdi vs5, vs4, vs12, 1
+ xxpermdi vs7, vs6, vs14, 1
+ xxpermdi vs9, vs8, vs0, 1
+ xxpermdi vs11, vs10, vs2, 1
+#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
+#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs13, vs12, vs4, 1
+ xxpermdi vs15, vs14, vs6, 1
+#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
+#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs25, vs0, vs8, 1
+ xxpermdi vs24, vs2, vs10, 1
+ xxpermdi vs27, vs4, vs12, 1
+ xxpermdi vs26, vs6, vs14, 1
+ xxpermdi vs29, vs8, vs0, 1
+ xxpermdi vs28, vs10, vs2, 1
+ xxpermdi vs31, vs12, vs4, 1
+ xxpermdi vs30, vs14, vs6, 1
+#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs12, vs4, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
+#endif
stxvp vs24, 0(CO)
stxvp vs26, 32(CO)
stxvp vs28, 0(T1)
.endm
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 0, 33, 34
+ xvf32gerpp 1, 32, 34
+#else
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
+#endif
.if \Complete==0
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf32gerpp 0, 37, 35
+ xvf32gerpp 1, 36, 35
+#else
xvf32gerpp 0, 37, 34
xvf32gerpp 1, 36, 34
+#endif
+
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs1, vs0, vs8, 1
+ xxpermdi vs3, vs2, vs10, 1
+ xxpermdi vs9, vs8, vs0, 1
+ xxpermdi vs11, vs10, vs2, 1
+#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
+#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9
#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs25, vs0, vs8, 1
+ xxpermdi vs24, vs2, vs10, 1
+ xxpermdi vs27, vs8, vs0, 1
+ xxpermdi vs26, vs10, vs2, 1
+#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2
#endif
+#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
addi CO, CO, 32
xxperm vs8, vs9, save_permute_1
#ifndef TRMMKERNEL
/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs1, vs0, vs8, 0
+ xxpermdi vs9, vs8, vs0, 3
+#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3
+#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs24, vs0, vs8, 0
+ xxpermdi vs26, vs8, vs0, 3
+#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3
#endif
+#endif
stxv vs24, 0(CO)
stxv vs26, 0(T1)
addi CO, CO, 16
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
vspltisb v10, 0
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs35, vs34, vs42, 2
+ xxpermdi vs34, vs34, vs42, 0
+#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
+#endif
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
.endm
xvf32gerpp 3, 35, 40
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs35, vs34, vs42, 2
+ xxpermdi vs34, vs34, vs42, 0
+#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
+#endif
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxperm vs0, vs1, save_permute_1
+ xxperm vs2, vs3, save_permute_1
+ xxperm vs4, vs5, save_permute_1
+ xxperm vs6, vs7, save_permute_1
+#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28
+#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
stxvp vs26, 32(CO)
#else
/* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ stxv vs2, 0(CO)
+ stxv vs0, 16(CO)
+ stxv vs6, 32(CO)
+ stxv vs4, 48(CO)
+#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
stxv vs4, 32(CO)
stxv vs6, 48(CO)
#endif
+#endif
addi CO, CO, 64
.endm
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
vspltisb v6, 0
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs35, vs34, vs38, 2
+ xxpermdi vs34, vs34, vs38, 0
+#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
+#endif
lxvp vs36, (32+\OffsetA)(AO)
.endm
xvf32gerpp 1, 35, 36
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxpermdi vs35, vs34, vs38, 2
+ xxpermdi vs34, vs34, vs38, 0
+#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
+#endif
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxperm vs0, vs1, save_permute_1
+ xxperm vs2, vs3, save_permute_1
+#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
+#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
stxvp vs24, 0(CO)
#else
/* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ stxv vs2, 0(CO)
+ stxv vs0, 16(CO)
+#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
#endif
+#endif
addi CO, CO, 32
.endm
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxperm vs0, vs1, save_permute_1
+#else
xxperm vs0, vs1, vs28
+#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs0
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxperm vs37, vs1, save_permute_1
+#else
xxperm vs37, vs1, vs28
+#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs36, vs36, vs37
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
+#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif
__asm__
(
"dcbt 0, %2 \n\t"
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "cswap_microk_power10.c"
#elif defined(POWER10)
-#include "cswap_microk_power8.c"
+#include "cswap_microk_power10.c"
#endif
#endif
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "dasum_microk_power10.c"
#elif defined(POWER10)
-#include "dasum_microk_power8.c"
+#include "dasum_microk_power10.c"
#endif
#endif
-
#ifndef HAVE_KERNEL_16
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
if ( inc_x == 1 )
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 32)
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
+ "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
+#else
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
+#endif
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
+ XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
+ XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
+ XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
+#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
-
+#endif
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
+ XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
+ XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
+ XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
+ XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
+ XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
+ XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
+ XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
+#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
+#endif
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
"one%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
+#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
+#endif
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 42, 33 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
+#else
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
+#endif
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 44, 34 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
+#else
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
+#endif
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 46, 35 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
+#else
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
+#endif
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 50, 38 \n\t"
+ "xvmaddadp 37, 51, 38 \n\t"
+#else
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
+#endif
"lxvpx 50, %7, %11 \n\t" // a4[0]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 52, 39 \n\t"
+ "xvmaddadp 37, 53, 39 \n\t"
+#else
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
+#endif
"lxvpx 52, %8, %11 \n\t" // a5[0]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 54, 48 \n\t"
+ "xvmaddadp 37, 55, 48 \n\t"
+#else
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
+#endif
"lxvpx 54, %9, %11 \n\t" // a6[0]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 56, 49 \n\t"
+ "xvmaddadp 37, 57, 49 \n\t"
+#else
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
+#endif
"lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t"
"two%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
+ "xvmaddadp 36, 42, 33 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
+ "xvmaddadp 36, 44, 34 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
+ "xvmaddadp 36, 46, 35 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
+ "xvmaddadp 36, 50, 38 \n\t"
+ "xvmaddadp 37, 51, 38 \n\t"
+ "xvmaddadp 36, 52, 39 \n\t"
+ "xvmaddadp 37, 53, 39 \n\t"
+ "xvmaddadp 36, 54, 48 \n\t"
+ "xvmaddadp 37, 55, 48 \n\t"
+ "xvmaddadp 36, 56, 49 \n\t"
+ "xvmaddadp 37, 57, 49 \n\t"
+#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
+#endif
"stxvp 36, 0( %2) \n\t" // y0, y1
:
"lxvp 40, 32(%[y]) \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ XXMRGHD_S(42,34,35)
+ XXMRGLD_S(43,34,35)
+ XXMRGHD_S(44,4,5)
+ XXMRGLD_S(45,4,5)
+#else
XXMRGLD_S(42,35,34)
XXMRGHD_S(43,35,34)
XXMRGLD_S(44,5,4)
XXMRGHD_S(45,5,4)
+#endif
"xvadddp 42,42,43 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ XXMRGHD_S(46,6,7)
+ XXMRGLD_S(47,6,7)
+#else
XXMRGLD_S(46,7,6)
XXMRGHD_S(47,7,6)
-
+#endif
"xvadddp 44,44,45 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ XXMRGHD_S(48,8,9)
+ XXMRGLD_S(49,8,9)
+#else
XXMRGLD_S(48,9,8)
XXMRGHD_S(49,9,8)
-
+#endif
"xvadddp 46,46,47 \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 38,42,36 \n\t"
+ "xvmaddadp 39,44,36 \n\t"
+#else
"xvmaddadp 39,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t"
-
+#endif
"xvadddp 48,48,49 \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 41,48,36 \n\t"
+#else
"xvmaddadp 41,46,36 \n\t"
-
+#endif
"stxvp 38, 0(%[y]) \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "xvmaddadp 40,46,36 \n\t"
+#else
"xvmaddadp 40,48,36 \n\t"
+#endif
"stxvp 40, 32(%[y]) \n\t"
: [memy] "+m" (*(double (*)[8])y),
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "drot_microk_power10.c"
#elif defined(POWER10)
-#include "drot_microk_power8.c"
+#include "drot_microk_power10.c"
#endif
#endif
if ( (inc_x == 1) && (inc_y == 1) )
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "dscal_microk_power10.c"
#elif defined(POWER10)
-#include "dscal_microk_power8.c"
+#include "dscal_microk_power10.c"
#endif
#endif
if ( da == 0.0 )
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
else
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "swap_microk_power10.c"
#elif defined(POWER10)
-#include "dswap_microk_power8.c"
+#include "swap_microk_power10.c"
#endif
#endif
if ( (inc_x == 1) && (inc_y == 1 ))
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "sasum_microk_power10.c"
#elif defined(POWER10)
-#include "sasum_microk_power8.c"
+#include "sasum_microk_power10.c"
#endif
#endif
if ( inc_x == 1 )
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "srot_microk_power10.c"
#elif defined(POWER10)
-#include "srot_microk_power8.c"
+#include "srot_microk_power10.c"
#endif
#endif
if ( (inc_x == 1) && (inc_y == 1) )
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sscal_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "sscal_microk_power10.c"
#elif defined(POWER10)
-#include "sscal_microk_power8.c"
+#include "sscal_microk_power10.c"
#endif
#endif
if ( da == 0.0 )
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
else
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "swap_microk_power10.c"
#elif defined(POWER10)
-#include "sswap_microk_power8.c"
+#include "swap_microk_power10.c"
#endif
#endif
if ( (inc_x == 1) && (inc_y == 1 ))
{
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
double alpha_r, double alpha_i)
{
#if !defined(CONJ)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ static const double mvec[2] = { -1.0, 1.0 };
+#else
+ static const double mvec[2] = { 1.0, -1.0 };
+#endif
+#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { 1.0, -1.0 };
#else
static const double mvec[2] = { -1.0, 1.0 };
#endif
+#endif
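+  /* mvec supplies the sign flip for the complex multiply; which lane
+     carries the -1.0 depends on the byte order, because the vector lane
+     that ends up holding the imaginary product swaps between big and
+     little endian. */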
const double *mvecp = mvec;
__vector double t0;
std r0, FLINK_SAVE(SP)
-#if defined(linux) || defined(__FreeBSD__)
+#if defined(linux) || defined(__FreeBSD__) || defined(_AIX)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
-#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#else
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
#endif
.endm
/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+#endif
.endm
/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+#endif
.endm
/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxmrghd \VSOUT1,\VSIN1,\VSIN2
+ xxmrgld \VSOUT2,\VSIN1,\VSIN2
+#else
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
+#endif
.endm
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxmrghd vs46,vs50,vs50
+ xxmrgld vs47,vs50,vs50
+#else
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
+#endif
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
+#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
xxmrghd vs39,vs47,vs46
+#endif
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 2, vs36, vs48
+ xvf64gerpp 3, vs38, vs48
+ xvf64gerpp 4, vs32, vs49
+ xvf64gerpp 5, vs34, vs49
+ xvf64gerpp 6, vs36, vs49
+ xvf64gerpp 7, vs38, vs49
+#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
+#endif
lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs40, vs50
+ xvf64gerpp 1, vs42, vs50
+ xvf64gerpp 2, vs44, vs50
+ xvf64gerpp 3, vs46, vs50
+ xvf64gerpp 4, vs40, vs51
+ xvf64gerpp 5, vs42, vs51
+ xvf64gerpp 6, vs44, vs51
+ xvf64gerpp 7, vs46, vs51
+#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
+#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)
.macro LOAD_END_2x8 OffsetA,OffsetB
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 2, vs36, vs48
+ xvf64gerpp 3, vs38, vs48
+ xvf64gerpp 4, vs32, vs49
+ xvf64gerpp 5, vs34, vs49
+ xvf64gerpp 6, vs36, vs49
+ xvf64gerpp 7, vs38, vs49
+#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
+#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxlor vs0, vs32, vs32
+ xxlor vs1, vs33, vs33
+ xxlor vs2, vs34, vs34
+ xxlor vs3, vs35, vs35
+ xxlor vs4, vs36, vs36
+ xxlor vs5, vs37, vs37
+ xxlor vs6, vs38, vs38
+ xxlor vs7, vs39, vs39
+ xxlor vs8, vs40, vs40
+ xxlor vs9, vs41, vs41
+ xxlor vs10, vs42, vs42
+ xxlor vs11, vs43, vs43
+ xxlor vs12, vs44, vs44
+ xxlor vs13, vs45, vs45
+ xxlor vs14, vs46, vs46
+ xxlor vs15, vs47, vs47
+#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
-
+#endif
xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxlor vs16, vs32, vs32
+ xxlor vs17, vs33, vs33
+ xxlor vs18, vs34, vs34
+ xxlor vs19, vs35, vs35
+ xxlor vs20, vs36, vs36
+ xxlor vs21, vs37, vs37
+ xxlor vs22, vs38, vs38
+ xxlor vs23, vs39, vs39
+ xxlor vs24, vs40, vs40
+ xxlor vs25, vs41, vs41
+ xxlor vs26, vs42, vs42
+ xxlor vs27, vs43, vs43
+ xxlor vs28, vs44, vs44
+ xxlor vs29, vs45, vs45
+ xxlor vs30, vs46, vs46
+ xxlor vs31, vs47, vs47
+#else
xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47
-
+#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
- xvf64gerpp 0, vs32, vs49
- xvf64gerpp 1, vs34, vs49
- xvf64gerpp 2, vs32, vs48
- xvf64gerpp 3, vs34, vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 2, vs32, vs49
+ xvf64gerpp 3, vs34, vs49
+#else
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs32, vs48
+ xvf64gerpp 3, vs34, vs48
+#endif
lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
- xvf64gerpp 0, vs40, vs51
- xvf64gerpp 1, vs42, vs51
- xvf64gerpp 2, vs40, vs50
- xvf64gerpp 3, vs42, vs50
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs40, vs50
+ xvf64gerpp 1, vs42, vs50
+ xvf64gerpp 2, vs40, vs51
+ xvf64gerpp 3, vs42, vs51
+#else
+ xvf64gerpp 0, vs40, vs51
+ xvf64gerpp 1, vs42, vs51
+ xvf64gerpp 2, vs40, vs50
+ xvf64gerpp 3, vs42, vs50
+#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP4(\Index,64)
.macro LOAD_END_2x4 OffsetA, OffsetB
- xvf64gerpp 0, vs32, vs49
- xvf64gerpp 1, vs34, vs49
- xvf64gerpp 2, vs32, vs48
- xvf64gerpp 3, vs34, vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 2, vs32, vs49
+ xvf64gerpp 3, vs34, vs49
+#else
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs32, vs48
+ xvf64gerpp 3, vs34, vs48
+#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxlor vs0, vs32, vs32
+ xxlor vs1, vs33, vs33
+ xxlor vs2, vs34, vs34
+ xxlor vs3, vs35, vs35
+ xxlor vs4, vs36, vs36
+ xxlor vs5, vs37, vs37
+ xxlor vs6, vs38, vs38
+ xxlor vs7, vs39, vs39
+ xxlor vs8, vs40, vs40
+ xxlor vs9, vs41, vs41
+ xxlor vs10, vs42, vs42
+ xxlor vs11, vs43, vs43
+ xxlor vs12, vs44, vs44
+ xxlor vs13, vs45, vs45
+ xxlor vs14, vs46, vs46
+ xxlor vs15, vs47, vs47
+#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
-
+#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
addi CO, CO, 64
.macro KERNEL2x2_2 Index, IsLast
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
- xvf64gerpp 0, vs32, vs49
- xvf64gerpp 1, vs32, vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs32, vs49
+#else
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs32, vs48
+#endif
lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
- xvf64gerpp 0, vs40, vs51
- xvf64gerpp 1, vs40, vs50
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs40, vs50
+ xvf64gerpp 1, vs40, vs51
+#else
+ xvf64gerpp 0, vs40, vs51
+ xvf64gerpp 1, vs40, vs50
+#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP4(\Index,64)
.macro LOAD_END_2x2 OffsetA,OffsetB
- xvf64gerpp 0, vs32, vs49
- xvf64gerpp 1, vs32, vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs32, vs49
+#else
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs32, vs48
+#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxlor vs0, vs32, vs32
+ xxlor vs1, vs33, vs33
+ xxlor vs2, vs34, vs34
+ xxlor vs3, vs35, vs35
+ xxlor vs4, vs36, vs36
+ xxlor vs5, vs37, vs37
+ xxlor vs6, vs38, vs38
+ xxlor vs7, vs39, vs39
+#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
-
+#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
SAVE2 vs4,vs5,vs6,vs7,T1,0
addi CO, CO, 32
lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
- xvf64gerpp 0, vs32, vs49
- xvf64gerpp 1, vs34, vs49
- xvf64gerpp 2, vs36, vs49
- xvf64gerpp 3, vs38, vs49
- xvf64gerpp 0, vs40, vs48
- xvf64gerpp 1, vs42, vs48
- xvf64gerpp 2, vs44, vs48
- xvf64gerpp 3, vs46, vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 2, vs36, vs48
+ xvf64gerpp 3, vs38, vs48
+ xvf64gerpp 0, vs40, vs49
+ xvf64gerpp 1, vs42, vs49
+ xvf64gerpp 2, vs44, vs49
+ xvf64gerpp 3, vs46, vs49
+#else
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs36, vs49
+ xvf64gerpp 3, vs38, vs49
+ xvf64gerpp 0, vs40, vs48
+ xvf64gerpp 1, vs42, vs48
+ xvf64gerpp 2, vs44, vs48
+ xvf64gerpp 3, vs46, vs48
+#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP2(\Index,32)
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxlor vs0, vs32, vs32
+ xxlor vs1, vs33, vs33
+ xxlor vs2, vs34, vs34
+ xxlor vs3, vs35, vs35
+ xxlor vs4, vs36, vs36
+ xxlor vs5, vs37, vs37
+ xxlor vs6, vs38, vs38
+ xxlor vs7, vs39, vs39
+ xxlor vs8, vs40, vs40
+ xxlor vs9, vs41, vs41
+ xxlor vs10, vs42, vs42
+ xxlor vs11, vs43, vs43
+ xxlor vs12, vs44, vs44
+ xxlor vs13, vs45, vs45
+ xxlor vs14, vs46, vs46
+ xxlor vs15, vs47, vs47
+#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
-
+#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
addi CO, CO, 128
.endm
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
- xvf64gerpp 0, vs32, vs49
- xvf64gerpp 1, vs34, vs49
- xvf64gerpp 0, vs40, vs48
- xvf64gerpp 1, vs42, vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 0, vs40, vs49
+ xvf64gerpp 1, vs42, vs49
+#else
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 0, vs40, vs48
+ xvf64gerpp 1, vs42, vs48
+#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP2(\Index,32)
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxlor vs0, vs32, vs32
+ xxlor vs1, vs33, vs33
+ xxlor vs2, vs34, vs34
+ xxlor vs3, vs35, vs35
+ xxlor vs4, vs36, vs36
+ xxlor vs5, vs37, vs37
+ xxlor vs6, vs38, vs38
+ xxlor vs7, vs39, vs39
+#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
-
+#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
addi CO, CO, 64
.endm
lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
- xvf64gerpp 0, vs32, vs49
- xvf64gerpp 0, vs40, vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 0, vs40, vs49
+#else
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 0, vs40, vs48
+#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP2(\Index,32)
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ xxlor vs0, vs32, vs32
+ xxlor vs1, vs33, vs33
+ xxlor vs2, vs34, vs34
+ xxlor vs3, vs35, vs35
+#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
+#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
addi CO, CO, 32
#elif HAVE_KERNEL_4x4_VEC
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
#if defined(DOUBLE)
#include "zscal_microk_power8.c"
#endif
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#elif defined(POWER10)
#if defined(DOUBLE)
#include "zscal_microk_power10.c"
#else
#include "cscal_microk_power10.c"
#endif
-#elif defined(POWER10)
-#if defined(DOUBLE)
-#include "zscal_microk_power8.c"
-#endif
#endif
#endif
"xsnegdp 33, %x10 \n\t" // -alpha_i
XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i
+#else
XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i
+#endif
"lxvp 40, 0(%2) \n\t"
"lxvp 42, 32(%2) \n\t"
"xvadddp 49, 49, 39 \n\t"
"xvadddp 50, 50, %x3 \n\t"
"xvadddp 51, 51, %x4 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 48, 0(%2) \n\t"
+ "stxv 49, 16(%2) \n\t"
+ "stxv 50, 32(%2) \n\t"
+ "stxv 51, 48(%2) \n\t"
+#else
"stxv 49, 0(%2) \n\t"
"stxv 48, 16(%2) \n\t"
"stxv 51, 32(%2) \n\t"
"stxv 50, 48(%2) \n\t"
+#endif
"xvadddp 34, 34, %x5 \n\t"
"xvadddp 36, 36, %x7 \n\t"
"xvadddp 37, 37, %x8 \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 34, 64(%2) \n\t"
+ "stxv 35, 80(%2) \n\t"
+ "stxv 36, 96(%2) \n\t"
+ "stxv 37, 112(%2) \n\t"
+#else
"stxv 35, 64(%2) \n\t"
"stxv 34, 80(%2) \n\t"
"stxv 37, 96(%2) \n\t"
"stxv 36, 112(%2) \n\t"
-
+#endif
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t"
"xvadddp 50, 50, %x3 \n\t"
"xvadddp 51, 51, %x4 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 48, 0(%2) \n\t"
+ "stxv 49, 16(%2) \n\t"
+ "stxv 50, 32(%2) \n\t"
+ "stxv 51, 48(%2) \n\t"
+#else
"stxv 49, 0(%2) \n\t"
"stxv 48, 16(%2) \n\t"
"stxv 51, 32(%2) \n\t"
"stxv 50, 48(%2) \n\t"
-
+#endif
"xvadddp 34, 34, %x5 \n\t"
"xvadddp 35, 35, %x6 \n\t"
"xvadddp 36, 36, %x7 \n\t"
"xvadddp 37, 37, %x8 \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ "stxv 34, 64(%2) \n\t"
+ "stxv 35, 80(%2) \n\t"
+ "stxv 36, 96(%2) \n\t"
+ "stxv 37, 112(%2) \n\t"
+#else
"stxv 35, 64(%2) \n\t"
"stxv 34, 80(%2) \n\t"
"stxv 37, 96(%2) \n\t"
"stxv 36, 112(%2) \n\t"
-
+#endif
"#n=%1 x=%0=%2 alpha=(%9,%10) \n"
:
"+m" (*x),
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "zswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#elif defined(POWER10)
#include "cswap_microk_power10.c"
-#elif defined(POWER10)
-#include "zswap_microk_power8.c"
#endif
#endif
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#define DGEMM_DEFAULT_UNROLL_M 16
-#define DGEMM_DEFAULT_UNROLL_N 4
-#else
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
-#endif
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8