KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1\r
.endm\r
.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast\r
- KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1\r
+ KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1\r
.endm\r
\r
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast\r
\r
lxv vs24, 0(BO)\r
lxv vs28, 16(BO)\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
lxv vs0, 0(AO)\r
lxv vs1, 16(AO)\r
- lxv vs2, 32(AO)\r
- lxv vs3, 48(AO)\r
- xxperm vs26, vs24, permute_mask\r
- xxperm vs30, vs28, permute_mask \r
xxpermdi vs25, vs24, vs24,2 \r
xxpermdi vs29, vs28, vs28,2 \r
-\r
+ lxv vs2, 32(AO)\r
+ lxv vs3, 48(AO) \r
xxpermdi vs27, vs26, vs26,2 \r
xxpermdi vs31, vs30, vs30,2 \r
\r
\r
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)\r
- lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)\r
-\r
- lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)\r
- lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)\r
- lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)\r
- lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) \r
-\r
- xxperm vs10, vs8, permute_mask\r
- xxperm vs14, vs12, permute_mask \r
- xxpermdi vs9, vs8, vs8,2 \r
- xxpermdi vs13, vs12, vs12,2 \r
-\r
- xvmaddasp vs32, vs0,vs24\r
- xvmaddasp vs33, vs1,vs24\r
- xvmaddasp vs34, vs2,vs24 \r
- xvmaddasp vs35, vs3,vs24 \r
-\r
- xvmaddasp vs36, vs0,vs25\r
- xvmaddasp vs37, vs1,vs25\r
- xvmaddasp vs38, vs2,vs25 \r
- xvmaddasp vs39, vs3,vs25 \r
-\r
- xxpermdi vs11, vs10, vs10,2 \r
- xxpermdi vs15, vs14, vs14,2 \r
-\r
- xvmaddasp vs40, vs0,vs26\r
- xvmaddasp vs41, vs1,vs26\r
- xvmaddasp vs42, vs2,vs26 \r
- xvmaddasp vs43, vs3,vs26\r
-\r
- xvmaddasp vs44, vs0,vs27\r
- xvmaddasp vs45, vs1,vs27\r
- xvmaddasp vs46, vs2,vs27 \r
- xvmaddasp vs47, vs3,vs27\r
-\r
- xvmaddasp vs48, vs0,vs28\r
- xvmaddasp vs49, vs1,vs28\r
- xvmaddasp vs50, vs2,vs28 \r
- xvmaddasp vs51, vs3,vs28 \r
-\r
- xvmaddasp vs52, vs0,vs29\r
- xvmaddasp vs53, vs1,vs29\r
- xvmaddasp vs54, vs2,vs29 \r
- xvmaddasp vs55, vs3,vs29\r
-\r
- xvmaddasp vs56, vs0,vs30\r
- xvmaddasp vs57, vs1,vs30\r
- xvmaddasp vs58, vs2,vs30 \r
- xvmaddasp vs59, vs3,vs30\r
-\r
- xvmaddasp vs60, vs0,vs31\r
- xvmaddasp vs61, vs1,vs31\r
- xvmaddasp vs62, vs2,vs31 \r
- xvmaddasp vs63, vs3,vs31 \r
-\r
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)\r
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)\r
-\r
- lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)\r
- lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)\r
- lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)\r
- lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)\r
-\r
- xxperm vs26, vs24, permute_mask\r
- xxperm vs30, vs28, permute_mask \r
- xxpermdi vs25, vs24, vs24,2 \r
- xxpermdi vs29, vs28, vs28,2 \r
- \r
-\r
- xvmaddasp vs32, vs4,vs8\r
- xvmaddasp vs33, vs5,vs8\r
- xvmaddasp vs34, vs6,vs8 \r
- xvmaddasp vs35, vs7,vs8 \r
- \r
- xvmaddasp vs36, vs4,vs9\r
- xvmaddasp vs37, vs5,vs9\r
- xvmaddasp vs38, vs6,vs9 \r
- xvmaddasp vs39, vs7,vs9\r
- \r
- xxpermdi vs27, vs26, vs26,2 \r
- xxpermdi vs31, vs30, vs30,2 \r
-\r
- xvmaddasp vs40, vs4,vs10\r
- xvmaddasp vs41, vs5,vs10\r
- xvmaddasp vs42, vs6,vs10 \r
- xvmaddasp vs43, vs7,vs10\r
-\r
- xvmaddasp vs44, vs4,vs11\r
- xvmaddasp vs45, vs5,vs11\r
- xvmaddasp vs46, vs6,vs11 \r
- xvmaddasp vs47, vs7,vs11\r
-\r
- xvmaddasp vs48, vs4,vs12\r
- xvmaddasp vs49, vs5,vs12\r
- xvmaddasp vs50, vs6,vs12 \r
- xvmaddasp vs51, vs7,vs12 \r
-\r
- xvmaddasp vs52, vs4,vs13\r
- xvmaddasp vs53, vs5,vs13\r
- xvmaddasp vs54, vs6,vs13 \r
- xvmaddasp vs55, vs7,vs13\r
-\r
- xvmaddasp vs56, vs4,vs14\r
- xvmaddasp vs57, vs5,vs14\r
- xvmaddasp vs58, vs6,vs14 \r
- xvmaddasp vs59, vs7,vs14\r
-\r
- xvmaddasp vs60, vs4,vs15\r
- xvmaddasp vs61, vs5,vs15\r
- xvmaddasp vs62, vs6,vs15 \r
- xvmaddasp vs63, vs7,vs15\r
-\r
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)\r
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)\r
-\r
- lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)\r
- lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)\r
- lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)\r
- lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) \r
-\r
- xxperm vs10, vs8, permute_mask\r
- xxperm vs14, vs12, permute_mask \r
- xxpermdi vs9, vs8, vs8,2 \r
- xxpermdi vs13, vs12, vs12,2 \r
-\r
- xvmaddasp vs32, vs0,vs24\r
- xvmaddasp vs33, vs1,vs24\r
- xvmaddasp vs34, vs2,vs24 \r
- xvmaddasp vs35, vs3,vs24 \r
-\r
- xvmaddasp vs36, vs0,vs25\r
- xvmaddasp vs37, vs1,vs25\r
- xvmaddasp vs38, vs2,vs25 \r
- xvmaddasp vs39, vs3,vs25\r
-\r
- xxpermdi vs11, vs10, vs10,2 \r
- xxpermdi vs15, vs14, vs14,2 \r
-\r
- xvmaddasp vs40, vs0,vs26\r
- xvmaddasp vs41, vs1,vs26\r
- xvmaddasp vs42, vs2,vs26 \r
- xvmaddasp vs43, vs3,vs26\r
-\r
- xvmaddasp vs44, vs0,vs27\r
- xvmaddasp vs45, vs1,vs27\r
- xvmaddasp vs46, vs2,vs27 \r
- xvmaddasp vs47, vs3,vs27\r
-\r
- xvmaddasp vs48, vs0,vs28\r
- xvmaddasp vs49, vs1,vs28\r
- xvmaddasp vs50, vs2,vs28 \r
- xvmaddasp vs51, vs3,vs28 \r
-\r
- xvmaddasp vs52, vs0,vs29\r
- xvmaddasp vs53, vs1,vs29\r
- xvmaddasp vs54, vs2,vs29 \r
- xvmaddasp vs55, vs3,vs29\r
-\r
- xvmaddasp vs56, vs0,vs30\r
- xvmaddasp vs57, vs1,vs30\r
- xvmaddasp vs58, vs2,vs30 \r
- xvmaddasp vs59, vs3,vs30\r
-\r
- xvmaddasp vs60, vs0,vs31\r
- xvmaddasp vs61, vs1,vs31\r
- xvmaddasp vs62, vs2,vs31 \r
- xvmaddasp vs63, vs3,vs31 \r
- \r
-.if \Complete==0\r
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)\r
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)\r
-\r
- lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)\r
- lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) \r
- lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)\r
- lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)\r
-\r
- xxperm vs26, vs24, permute_mask\r
- xxperm vs30, vs28, permute_mask \r
- xxpermdi vs25, vs24, vs24,2 \r
- xxpermdi vs29, vs28, vs28,2 \r
-\r
-.endif \r
-.if \IsLast==1 \r
-.if \Complete==1\r
- \r
- addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)\r
- addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)\r
-.else\r
- \r
- addi \BREG, \BREG, DISP32(\Index,128)\r
- addi \AREG, \AREG, DISP64(\Index,256)\r
-.endif\r
-.endif \r
- \r
- xvmaddasp vs32, vs4,vs8\r
- xvmaddasp vs33, vs5,vs8\r
- xvmaddasp vs34, vs6,vs8 \r
- xvmaddasp vs35, vs7,vs8 \r
- \r
- xvmaddasp vs36, vs4,vs9\r
- xvmaddasp vs37, vs5,vs9\r
- xvmaddasp vs38, vs6,vs9 \r
- xvmaddasp vs39, vs7,vs9\r
- \r
-.if \Complete==0 \r
- xxpermdi vs27, vs26, vs26,2 \r
- xxpermdi vs31, vs30, vs30,2 \r
- \r
-.endif\r
- \r
- xvmaddasp vs40, vs4,vs10\r
- xvmaddasp vs41, vs5,vs10\r
- xvmaddasp vs42, vs6,vs10 \r
- xvmaddasp vs43, vs7,vs10\r
-\r
- xvmaddasp vs44, vs4,vs11\r
- xvmaddasp vs45, vs5,vs11\r
- xvmaddasp vs46, vs6,vs11 \r
- xvmaddasp vs47, vs7,vs11\r
-\r
- xvmaddasp vs48, vs4,vs12\r
- xvmaddasp vs49, vs5,vs12\r
- xvmaddasp vs50, vs6,vs12 \r
- xvmaddasp vs51, vs7,vs12 \r
-\r
- xvmaddasp vs52, vs4,vs13\r
- xvmaddasp vs53, vs5,vs13\r
- xvmaddasp vs54, vs6,vs13 \r
- xvmaddasp vs55, vs7,vs13\r
-\r
- xvmaddasp vs56, vs4,vs14\r
- xvmaddasp vs57, vs5,vs14\r
- xvmaddasp vs58, vs6,vs14 \r
- xvmaddasp vs59, vs7,vs14\r
-\r
- xvmaddasp vs60, vs4,vs15\r
- xvmaddasp vs61, vs5,vs15\r
- xvmaddasp vs62, vs6,vs15 \r
- xvmaddasp vs63, vs7,vs15\r
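+/* the 4-iteration (L4) body is composed of two calls to the 2-iteration (L2) kernel */\r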
+  KERNEL8x16_L1_L2_I  \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0\r
+  KERNEL8x16_L1_L2_I  \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete\r
\r
.endm\r
\r
END8x16 \First, AO, BO, 64,32 \r
.endm\r
\r
-.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete\r
- \r
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)\r
+.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
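+/* loads, permutes and the next-iteration preloads (guarded by \Complete==0) are interleaved\r
+   with the xvmaddasp accumulation below to help hide load and permute latency */\r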
+ lxv vs8, DISP16(\Index,\OffsetB)(\BREG)\r
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)\r
\r
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs36, vs0,vs25\r
+ lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)\r
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)\r
+ xxperm vs10, vs8, permute_mask\r
+ xxperm vs14, vs12, permute_mask \r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs44, vs0,vs27\r
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)\r
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) \r
+ xvmaddasp vs48, vs0,vs28\r
+ xvmaddasp vs52, vs0,vs29\r
\r
- xxperm vs10, vs8, permute_mask\r
- xxperm vs14, vs12, permute_mask \r
xxpermdi vs9, vs8, vs8,2 \r
xxpermdi vs13, vs12, vs12,2 \r
-.if \First==1\r
- xvmulsp vs32, vs0,vs24\r
- xvmulsp vs33, vs1,vs24\r
- xvmulsp vs34, vs2,vs24 \r
- xvmulsp vs35, vs3,vs24 \r
\r
- xvmulsp vs36, vs0,vs25\r
- xvmulsp vs37, vs1,vs25\r
- xvmulsp vs38, vs2,vs25 \r
- xvmulsp vs39, vs3,vs25 \r
-.else\r
- xvmaddasp vs32, vs0,vs24\r
- xvmaddasp vs33, vs1,vs24\r
- xvmaddasp vs34, vs2,vs24 \r
- xvmaddasp vs35, vs3,vs24\r
-\r
- xvmaddasp vs36, vs0,vs25\r
- xvmaddasp vs37, vs1,vs25\r
- xvmaddasp vs38, vs2,vs25 \r
- xvmaddasp vs39, vs3,vs25 \r
-.endif\r
+ xvmaddasp vs56, vs0,vs30\r
+ xvmaddasp vs60, vs0,vs31\r
\r
xxpermdi vs11, vs10, vs10,2 \r
xxpermdi vs15, vs14, vs14,2 \r
- \r
-.if \First==1 \r
- xvmulsp vs40, vs0,vs26\r
- xvmulsp vs41, vs1,vs26\r
- xvmulsp vs42, vs2,vs26 \r
- xvmulsp vs43, vs3,vs26\r
-\r
- xvmulsp vs44, vs0,vs27\r
- xvmulsp vs45, vs1,vs27\r
- xvmulsp vs46, vs2,vs27 \r
- xvmulsp vs47, vs3,vs27\r
\r
- xvmulsp vs48, vs0,vs28\r
- xvmulsp vs49, vs1,vs28\r
- xvmulsp vs50, vs2,vs28 \r
- xvmulsp vs51, vs3,vs28 \r
\r
- xvmulsp vs52, vs0,vs29\r
- xvmulsp vs53, vs1,vs29\r
- xvmulsp vs54, vs2,vs29 \r
- xvmulsp vs55, vs3,vs29\r
\r
- xvmulsp vs56, vs0,vs30\r
- xvmulsp vs57, vs1,vs30\r
- xvmulsp vs58, vs2,vs30 \r
- xvmulsp vs59, vs3,vs30\r
-\r
- xvmulsp vs60, vs0,vs31\r
- xvmulsp vs61, vs1,vs31\r
- xvmulsp vs62, vs2,vs31 \r
- xvmulsp vs63, vs3,vs31\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs37, vs1,vs25\r
\r
-.else \r
- xvmaddasp vs40, vs0,vs26\r
xvmaddasp vs41, vs1,vs26\r
- xvmaddasp vs42, vs2,vs26 \r
- xvmaddasp vs43, vs3,vs26\r
-\r
- xvmaddasp vs44, vs0,vs27\r
xvmaddasp vs45, vs1,vs27\r
- xvmaddasp vs46, vs2,vs27 \r
- xvmaddasp vs47, vs3,vs27\r
-\r
- xvmaddasp vs48, vs0,vs28\r
xvmaddasp vs49, vs1,vs28\r
- xvmaddasp vs50, vs2,vs28 \r
- xvmaddasp vs51, vs3,vs28 \r
-\r
- xvmaddasp vs52, vs0,vs29\r
xvmaddasp vs53, vs1,vs29\r
- xvmaddasp vs54, vs2,vs29 \r
- xvmaddasp vs55, vs3,vs29\r
-\r
- xvmaddasp vs56, vs0,vs30\r
xvmaddasp vs57, vs1,vs30\r
- xvmaddasp vs58, vs2,vs30 \r
- xvmaddasp vs59, vs3,vs30\r
-\r
- xvmaddasp vs60, vs0,vs31\r
- xvmaddasp vs61, vs1,vs31\r
+ xvmaddasp vs61, vs1,vs31 \r
+.if \Complete==0\r
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)\r
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) \r
+.endif\r
+ xvmaddasp vs34, vs2,vs24\r
+ xvmaddasp vs38, vs2,vs25\r
+ xvmaddasp vs42, vs2,vs26\r
+ xvmaddasp vs46, vs2,vs27\r
+ xvmaddasp vs50, vs2,vs28\r
+ xvmaddasp vs54, vs2,vs29\r
+ xvmaddasp vs58, vs2,vs30\r
xvmaddasp vs62, vs2,vs31 \r
- xvmaddasp vs63, vs3,vs31 \r
\r
+ xvmaddasp vs35, vs3,vs24 \r
+ xvmaddasp vs39, vs3,vs25\r
+ xvmaddasp vs43, vs3,vs26\r
+ xvmaddasp vs47, vs3,vs27\r
+ xvmaddasp vs51, vs3,vs28\r
+ xvmaddasp vs55, vs3,vs29\r
+ xvmaddasp vs59, vs3,vs30\r
+ xvmaddasp vs63, vs3,vs31\r
+.if \Complete==0 \r
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)\r
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)\r
.endif\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs36, vs4,vs9\r
.if \Complete==0\r
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)\r
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)\r
-\r
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)\r
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)\r
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)\r
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)\r
-\r
- xxperm vs26, vs24, permute_mask\r
- xxperm vs30, vs28, permute_mask \r
- xxpermdi vs25, vs24, vs24,2 \r
- xxpermdi vs29, vs28, vs28,2 \r
-.endif \r
+.endif\r
.if \IsLast==1 \r
.if \Complete==1\r
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) \r
- addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)\r
+ addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) \r
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)\r
\r
.else\r
- addi \BREG, \BREG, DISP16(\Index,64)\r
- addi \AREG, \AREG, DISP32(\Index,128) \r
+ addi \AREG, \AREG, DISP32(\Index,128) \r
+ addi \BREG, \BREG, DISP16(\Index,64)\r
+\r
.endif\r
+.endif \r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs44, vs4,vs11\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
.endif\r
-\r
-.if \First==1\r
- xvmulsp vs32, vs4,vs8\r
- xvmulsp vs33, vs5,vs8\r
- xvmulsp vs34, vs6,vs8 \r
- xvmulsp vs35, vs7,vs8\r
-\r
- xvmulsp vs36, vs4,vs9\r
- xvmulsp vs37, vs5,vs9\r
- xvmulsp vs38, vs6,vs9 \r
- xvmulsp vs39, vs7,vs9\r
-.else\r
- xvmaddasp vs32, vs4,vs8\r
- xvmaddasp vs33, vs5,vs8\r
- xvmaddasp vs34, vs6,vs8 \r
- xvmaddasp vs35, vs7,vs8 \r
-\r
- xvmaddasp vs36, vs4,vs9\r
- xvmaddasp vs37, vs5,vs9\r
- xvmaddasp vs38, vs6,vs9 \r
- xvmaddasp vs39, vs7,vs9\r
+ xvmaddasp vs48, vs4,vs12\r
+ xvmaddasp vs52, vs4,vs13\r
+.if \Complete==0 \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs29, vs28, vs28,2 \r
.endif \r
+\r
+ xvmaddasp vs56, vs4,vs14\r
+ xvmaddasp vs60, vs4,vs15\r
\r
.if \Complete==0 \r
xxpermdi vs27, vs26, vs26,2 \r
xxpermdi vs31, vs30, vs30,2 \r
- \r
-.endif\r
-.if \First==1 \r
- xvmulsp vs40, vs4,vs10\r
- xvmulsp vs41, vs5,vs10\r
- xvmulsp vs42, vs6,vs10 \r
- xvmulsp vs43, vs7,vs10\r
-\r
- xvmulsp vs44, vs4,vs11\r
- xvmulsp vs45, vs5,vs11\r
- xvmulsp vs46, vs6,vs11 \r
- xvmulsp vs47, vs7,vs11\r
-\r
- xvmulsp vs48, vs4,vs12\r
- xvmulsp vs49, vs5,vs12\r
- xvmulsp vs50, vs6,vs12 \r
- xvmulsp vs51, vs7,vs12 \r
-\r
- xvmulsp vs52, vs4,vs13\r
- xvmulsp vs53, vs5,vs13\r
- xvmulsp vs54, vs6,vs13 \r
- xvmulsp vs55, vs7,vs13\r
-\r
- xvmulsp vs56, vs4,vs14\r
- xvmulsp vs57, vs5,vs14\r
- xvmulsp vs58, vs6,vs14 \r
- xvmulsp vs59, vs7,vs14\r
-\r
- xvmulsp vs60, vs4,vs15\r
- xvmulsp vs61, vs5,vs15\r
- xvmulsp vs62, vs6,vs15 \r
- xvmulsp vs63, vs7,vs15\r
+ \r
+.endif \r
\r
-.else \r
- xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs37, vs5,vs9\r
xvmaddasp vs41, vs5,vs10\r
- xvmaddasp vs42, vs6,vs10 \r
- xvmaddasp vs43, vs7,vs10\r
-\r
- xvmaddasp vs44, vs4,vs11\r
xvmaddasp vs45, vs5,vs11\r
- xvmaddasp vs46, vs6,vs11 \r
- xvmaddasp vs47, vs7,vs11\r
-\r
- xvmaddasp vs48, vs4,vs12\r
xvmaddasp vs49, vs5,vs12\r
- xvmaddasp vs50, vs6,vs12 \r
- xvmaddasp vs51, vs7,vs12 \r
-\r
- xvmaddasp vs52, vs4,vs13\r
xvmaddasp vs53, vs5,vs13\r
- xvmaddasp vs54, vs6,vs13 \r
- xvmaddasp vs55, vs7,vs13\r
-\r
- xvmaddasp vs56, vs4,vs14\r
xvmaddasp vs57, vs5,vs14\r
- xvmaddasp vs58, vs6,vs14 \r
- xvmaddasp vs59, vs7,vs14\r
-\r
- xvmaddasp vs60, vs4,vs15\r
xvmaddasp vs61, vs5,vs15\r
- xvmaddasp vs62, vs6,vs15 \r
- xvmaddasp vs63, vs7,vs15\r
\r
-.endif\r
+ xvmaddasp vs34, vs6,vs8 \r
+ xvmaddasp vs38, vs6,vs9 \r
+ xvmaddasp vs42, vs6,vs10\r
+ xvmaddasp vs46, vs6,vs11\r
+ xvmaddasp vs50, vs6,vs12\r
+ xvmaddasp vs54, vs6,vs13\r
+ xvmaddasp vs58, vs6,vs14\r
+ xvmaddasp vs62, vs6,vs15\r
\r
+ xvmaddasp vs35, vs7,vs8 \r
+ xvmaddasp vs39, vs7,vs9 \r
+ xvmaddasp vs43, vs7,vs10\r
+ xvmaddasp vs47, vs7,vs11\r
+ xvmaddasp vs51, vs7,vs12\r
+ xvmaddasp vs55, vs7,vs13\r
+ xvmaddasp vs59, vs7,vs14\r
+ xvmaddasp vs63, vs7,vs15\r
+ \r
.endm\r
\r
\r
\r
xxmrghw vs2, vs37, vs41\r
xxmrghw vs3, vs33, vs45\r
-\r
+#ifndef TRMMKERNEL \r
+ lxv vs32, 0(CO)\r
+ lxv vs33, 16(CO) \r
+#endif \r
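+/* loads of C, result permutes, alpha scaling and stores are interleaved below so that\r
+   memory accesses can overlap with the vector work */\r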
xxmrglw vs16, vs34, vs46\r
xxmrglw vs18, vs38, vs42 \r
\r
\r
xxmrghw vs30, vs39, vs43 \r
xxmrghw vs31, vs35, vs47\r
-\r
- xxperm vs8, vs0, save_permute_1\r
- xxperm vs10, vs1, save_permute_1\r
- xxperm vs9, vs0, save_permute_2 \r
- xxperm vs11, vs1, save_permute_2 \r
-\r
-#ifndef TRMMKERNEL \r
- lxv vs32, 0(CO)\r
- lxv vs33, 16(CO) \r
+#ifndef TRMMKERNEL \r
lxv vs34, 32(CO) \r
lxv vs35, 48(CO) \r
#endif\r
- xxlor vs25, vs24, vs24\r
- xxlor vs27, vs26, vs26 \r
-\r
+ xxperm vs8, vs0, save_permute_1\r
+ xxperm vs10, vs1, save_permute_1\r
#ifndef TRMMKERNEL \r
lxv vs36, 0(T1)\r
lxv vs37, 16(T1) \r
+#endif\r
+ xxperm vs9, vs0, save_permute_2 \r
+ xxperm vs11, vs1, save_permute_2 \r
+\r
+#ifndef TRMMKERNEL \r
lxv vs38, 32(T1) \r
lxv vs39, 48(T1) \r
#endif\r
+\r
+ xxlor vs25, vs24, vs24\r
+ xxlor vs27, vs26, vs26 \r
+\r
+\r
+\r
#ifndef TRMMKERNEL \r
lxv vs40, 0(T2)\r
lxv vs41, 16(T2) \r
+#endif\r
+\r
+ xxperm vs12, vs2, save_permute_1\r
+ xxperm vs14, vs3, save_permute_1\r
+#ifndef TRMMKERNEL \r
lxv vs42, 32(T2) \r
lxv vs43, 48(T2) \r
#endif \r
+ \r
+ xxperm vs13, vs2, save_permute_2 \r
+ xxperm vs15, vs3, save_permute_2 \r
#ifndef TRMMKERNEL \r
lxv vs44, 0(T3)\r
- lxv vs45, 16(T3) \r
+ lxv vs45, 16(T3)\r
+#endif\r
+ xxperm vs16, vs4, save_permute_1\r
+ xxperm vs18, vs5, save_permute_1\r
+#ifndef TRMMKERNEL \r
lxv vs46, 32(T3) \r
lxv vs47, 48(T3) \r
#endif \r
\r
- xxperm vs12, vs2, save_permute_1\r
- xxperm vs14, vs3, save_permute_1\r
- \r
- xxperm vs13, vs2, save_permute_2 \r
- xxperm vs15, vs3, save_permute_2 \r
+ \r
+\r
\r
- xxperm vs16, vs4, save_permute_1\r
- xxperm vs18, vs5, save_permute_1\r
\r
xxperm vs17, vs4, save_permute_2 \r
xxperm vs19, vs5, save_permute_2 \r
-\r
+#ifdef TRMMKERNEL\r
+ xvmulsp vs32, vs8, alpha_r \r
+ xvmulsp vs33, vs12, alpha_r \r
+#else \r
+ xvmaddasp vs32, vs8, alpha_r \r
+ xvmaddasp vs33, vs12, alpha_r \r
+#endif \r
xxperm vs24, vs30, save_permute_1\r
xxperm vs26, vs31, save_permute_1 \r
+\r
+ \r
+ stxv vs32, 0(CO)\r
+ stxv vs33, 16(CO) \r
+#ifdef TRMMKERNEL \r
+ xvmulsp vs34, vs16, alpha_r \r
+ xvmulsp vs35, vs24, alpha_r \r
+#else \r
+ xvmaddasp vs34, vs16, alpha_r \r
+ xvmaddasp vs35, vs24, alpha_r \r
+#endif \r
\r
xxperm vs25, vs30, save_permute_2 \r
xxperm vs27, vs31, save_permute_2 \r
\r
\r
- /* multiply add normal way */\r
- \r
-#ifdef TRMMKERNEL\r
- xvmulsp vs32, vs8, alpha_r \r
- xvmulsp vs33, vs12, alpha_r \r
- xvmulsp vs34, vs16, alpha_r \r
- xvmulsp vs35, vs24, alpha_r \r
+ stxv vs34, 32(CO) \r
+ stxv vs35, 48(CO) \r
+#ifdef TRMMKERNEL \r
xvmulsp vs36, vs9, alpha_r \r
- xvmulsp vs37, vs13, alpha_r \r
+ xvmulsp vs37, vs13, alpha_r \r
+#else \r
+ xvmaddasp vs36, vs9, alpha_r \r
+ xvmaddasp vs37, vs13, alpha_r \r
+#endif \r
+ stxv vs36, 0(T1)\r
+ stxv vs37, 16(T1)\r
+#ifdef TRMMKERNEL \r
xvmulsp vs38, vs17, alpha_r \r
xvmulsp vs39, vs25, alpha_r \r
-#else \r
- xvmaddasp vs32, vs8, alpha_r \r
- xvmaddasp vs33, vs12, alpha_r \r
- xvmaddasp vs34, vs16, alpha_r \r
- xvmaddasp vs35, vs24, alpha_r \r
- xvmaddasp vs36, vs9, alpha_r \r
- xvmaddasp vs37, vs13, alpha_r \r
+#else \r
xvmaddasp vs38, vs17, alpha_r \r
xvmaddasp vs39, vs25, alpha_r \r
#endif \r
-\r
-\r
+ stxv vs38, 32(T1) \r
+ stxv vs39, 48(T1)\r
\r
#ifdef TRMMKERNEL\r
xvmulsp vs40, vs10, alpha_r \r
- xvmulsp vs41, vs14, alpha_r \r
- xvmulsp vs42, vs18, alpha_r \r
- xvmulsp vs43, vs26, alpha_r \r
- xvmulsp vs44, vs11, alpha_r \r
- xvmulsp vs45, vs15, alpha_r \r
- xvmulsp vs46, vs19, alpha_r \r
- xvmulsp vs47, vs27, alpha_r \r
-#else\r
-\r
+ xvmulsp vs41, vs14, alpha_r \r
+#else \r
xvmaddasp vs40, vs10, alpha_r \r
xvmaddasp vs41, vs14, alpha_r \r
- xvmaddasp vs42, vs18, alpha_r \r
- xvmaddasp vs43, vs26, alpha_r \r
- xvmaddasp vs44, vs11, alpha_r \r
- xvmaddasp vs45, vs15, alpha_r \r
- xvmaddasp vs46, vs19, alpha_r \r
- xvmaddasp vs47, vs27, alpha_r \r
- \r
-#endif \r
-\r
- stxv vs32, 0(CO)\r
- stxv vs33, 16(CO) \r
- stxv vs34, 32(CO) \r
- stxv vs35, 48(CO) \r
-\r
- stxv vs36, 0(T1)\r
- stxv vs37, 16(T1) \r
- stxv vs38, 32(T1) \r
- stxv vs39, 48(T1)\r
+#endif \r
\r
stxv vs40, 0(T2)\r
stxv vs41, 16(T2) \r
+#ifdef TRMMKERNEL \r
+ xvmulsp vs42, vs18, alpha_r \r
+ xvmulsp vs43, vs26, alpha_r \r
+#else \r
+ xvmaddasp vs42, vs18, alpha_r \r
+ xvmaddasp vs43, vs26, alpha_r\r
+#endif \r
stxv vs42, 32(T2) \r
stxv vs43, 48(T2) \r
+#ifdef TRMMKERNEL \r
+ xvmulsp vs44, vs11, alpha_r \r
+ xvmulsp vs45, vs15, alpha_r \r
+#else\r
+ xvmaddasp vs44, vs11, alpha_r \r
+ xvmaddasp vs45, vs15, alpha_r \r
+#endif \r
stxv vs44, 0(T3)\r
stxv vs45, 16(T3) \r
+#ifdef TRMMKERNEL \r
+ xvmulsp vs46, vs19, alpha_r \r
+ xvmulsp vs47, vs27, alpha_r \r
+#else \r
+ xvmaddasp vs46, vs19, alpha_r \r
+ xvmaddasp vs47, vs27, alpha_r \r
+#endif \r
stxv vs46, 32(T3) \r
stxv vs47, 48(T3)\r
\r
/***** the same for the second 8x8 *****/\r
-#ifndef TRMMKERNEL\r
- \r
+#ifndef TRMMKERNEL \r
lxv vs32, 0(T4)\r
lxv vs33, 16(T4) \r
- lxv vs34, 32(T4) \r
- lxv vs35, 48(T4) \r
- lxv vs36, 0(T5)\r
- lxv vs37, 16(T5) \r
- lxv vs38,32(T5) \r
- lxv vs39, 48(T5) \r
#endif \r
- \r
xxmrglw vs8, vs48, vs60\r
xxmrglw vs10, vs52, vs56 \r
-\r
+#ifndef TRMMKERNEL \r
+ lxv vs34, 32(T4) \r
+ lxv vs35, 48(T4) \r
+#endif \r
xxmrghw vs1, vs48, vs60\r
xxmrghw vs0, vs52, vs56\r
+#ifndef TRMMKERNEL \r
+ lxv vs36, 0(T5)\r
+ lxv vs37, 16(T5) \r
+#endif \r
xxmrglw vs12, vs49, vs61\r
xxmrglw vs14, vs53, vs57 \r
-\r
+#ifndef TRMMKERNEL \r
+ lxv vs38,32(T5) \r
+ lxv vs39, 48(T5) \r
+#endif \r
+ \r
+ xxmrghw vs2, vs53, vs57\r
+ xxmrghw vs3, vs49, vs61\r
#ifndef TRMMKERNEL \r
lxv vs40, 0(T6)\r
- lxv vs41, 16(T6) \r
- lxv vs42, 32(T6) \r
- lxv vs43, 48(T6) \r
- lxv vs44, 0(T7)\r
- lxv vs45, 16(T7) \r
- lxv vs46, 32(T7) \r
- lxv vs47, 48(T7) \r
+ lxv vs41, 16(T6)\r
#endif \r
- xxmrghw vs2, vs53, vs57\r
- xxmrghw vs3, vs49, vs61\r
-\r
xxmrglw vs16, vs50, vs62\r
xxmrglw vs18, vs54, vs58 \r
-\r
+#ifndef TRMMKERNEL \r
+ lxv vs42, 32(T6) \r
+ lxv vs43, 48(T6) \r
+#endif \r
xxlor vs9, vs8, vs8\r
xxlor vs11, vs10, vs10 \r
xxmrghw vs4, vs54, vs58\r
xxmrghw vs5, vs50, vs62\r
-\r
+#ifndef TRMMKERNEL \r
+ lxv vs44, 0(T7)\r
+ lxv vs45, 16(T7) \r
+#endif \r
xxlor vs13, vs12, vs12\r
xxlor vs15, vs14, vs14\r
\r
xxmrglw vs24, vs51, vs63\r
- xxmrglw vs26, vs55, vs59 \r
-\r
+ xxmrglw vs26, vs55, vs59 \r
+#ifndef TRMMKERNEL \r
+ lxv vs46, 32(T7) \r
+ lxv vs47, 48(T7) \r
+#endif \r
xxlor vs17, vs16, vs16\r
xxlor vs19, vs18, vs18\r
xxmrghw vs30, vs55, vs59 \r
- xxmrghw vs31, vs51, vs63\r
+ xxmrghw vs31, vs51, vs63 \r
+\r
+ \r
\r
xxperm vs8, vs0, save_permute_1\r
xxperm vs10, vs1, save_permute_1\r
xxlor vs27, vs26, vs26 \r
xxperm vs12, vs2, save_permute_1\r
xxperm vs14, vs3, save_permute_1\r
+\r
xxperm vs13, vs2, save_permute_2 \r
xxperm vs15, vs3, save_permute_2 \r
- \r
+#ifdef TRMMKERNEL\r
+ xvmulsp vs32, vs8, alpha_r \r
+ xvmulsp vs33, vs12, alpha_r \r
+#else \r
+ xvmaddasp vs32, vs8, alpha_r \r
+ xvmaddasp vs33, vs12, alpha_r \r
+#endif \r
xxperm vs16, vs4, save_permute_1\r
xxperm vs18, vs5, save_permute_1\r
+ stxv vs32, 0(T4)\r
+ stxv vs33, 16(T4) \r
xxperm vs17, vs4, save_permute_2 \r
xxperm vs19, vs5, save_permute_2 \r
xxperm vs24, vs30, save_permute_1\r
xxperm vs25, vs30, save_permute_2 \r
xxperm vs27, vs31, save_permute_2 \r
\r
-#ifdef TRMMKERNEL\r
- xvmulsp vs32, vs8, alpha_r \r
- xvmulsp vs33, vs12, alpha_r \r
+#ifdef TRMMKERNEL \r
xvmulsp vs34, vs16, alpha_r \r
- xvmulsp vs35, vs24, alpha_r \r
+ xvmulsp vs35, vs24, alpha_r \r
+#else \r
+ xvmaddasp vs34, vs16, alpha_r \r
+ xvmaddasp vs35, vs24, alpha_r \r
+#endif \r
+ stxv vs34, 32(T4) \r
+ stxv vs35, 48(T4) \r
+\r
+#ifdef TRMMKERNEL \r
xvmulsp vs36, vs9, alpha_r \r
- xvmulsp vs37, vs13, alpha_r \r
+ xvmulsp vs37, vs13, alpha_r \r
+#else \r
+ xvmaddasp vs36, vs9, alpha_r \r
+ xvmaddasp vs37, vs13, alpha_r \r
+#endif \r
+ stxv vs36, 0(T5)\r
+ stxv vs37, 16(T5) \r
+\r
+#ifdef TRMMKERNEL \r
xvmulsp vs38, vs17, alpha_r \r
xvmulsp vs39, vs25, alpha_r \r
-#else \r
- xvmaddasp vs32, vs8, alpha_r \r
- xvmaddasp vs33, vs12, alpha_r \r
- xvmaddasp vs34, vs16, alpha_r \r
- xvmaddasp vs35, vs24, alpha_r \r
- xvmaddasp vs36, vs9, alpha_r \r
- xvmaddasp vs37, vs13, alpha_r \r
+#else \r
xvmaddasp vs38, vs17, alpha_r \r
xvmaddasp vs39, vs25, alpha_r \r
#endif \r
\r
- stxv vs32, 0(T4)\r
- stxv vs33, 16(T4) \r
- stxv vs34, 32(T4) \r
- stxv vs35, 48(T4) \r
\r
- stxv vs36, 0(T5)\r
- stxv vs37, 16(T5) \r
+\r
+ \r
stxv vs38, 32(T5) \r
stxv vs39, 48(T5)\r
\r
+\r
#ifdef TRMMKERNEL\r
xvmulsp vs40, vs10, alpha_r \r
- xvmulsp vs41, vs14, alpha_r \r
- xvmulsp vs42, vs18, alpha_r \r
- xvmulsp vs43, vs26, alpha_r \r
- xvmulsp vs44, vs11, alpha_r \r
- xvmulsp vs45, vs15, alpha_r \r
- xvmulsp vs46, vs19, alpha_r \r
- xvmulsp vs47, vs27, alpha_r \r
-#else\r
-\r
+ xvmulsp vs41, vs14, alpha_r \r
+#else \r
xvmaddasp vs40, vs10, alpha_r \r
xvmaddasp vs41, vs14, alpha_r \r
- xvmaddasp vs42, vs18, alpha_r \r
- xvmaddasp vs43, vs26, alpha_r \r
- xvmaddasp vs44, vs11, alpha_r \r
- xvmaddasp vs45, vs15, alpha_r \r
- xvmaddasp vs46, vs19, alpha_r \r
- xvmaddasp vs47, vs27, alpha_r \r
- \r
#endif \r
-\r
stxv vs40, 0(T6)\r
- stxv vs41, 16(T6) \r
+ stxv vs41, 16(T6) \r
+#ifdef TRMMKERNEL \r
+ xvmulsp vs42, vs18, alpha_r \r
+ xvmulsp vs43, vs26, alpha_r \r
+#else \r
+ xvmaddasp vs42, vs18, alpha_r \r
+ xvmaddasp vs43, vs26, alpha_r\r
+#endif \r
stxv vs42, 32(T6) \r
stxv vs43, 48(T6) \r
+#ifdef TRMMKERNEL \r
+ xvmulsp vs44, vs11, alpha_r \r
+ xvmulsp vs45, vs15, alpha_r \r
+#else\r
+ xvmaddasp vs44, vs11, alpha_r \r
+ xvmaddasp vs45, vs15, alpha_r \r
+#endif \r
+\r
stxv vs44, 0(T7)\r
stxv vs45, 16(T7) \r
+#ifdef TRMMKERNEL \r
+ xvmulsp vs46, vs19, alpha_r \r
+ xvmulsp vs47, vs27, alpha_r \r
+#else \r
+ xvmaddasp vs46, vs19, alpha_r \r
+ xvmaddasp vs47, vs27, alpha_r \r
+#endif \r
+ \r
stxv vs46, 32(T7) \r
stxv vs47, 48(T7)\r
\r
\r
xxperm vs10, vs8, permute_mask\r
xxperm vs14, vs12, permute_mask \r
- xxpermdi vs9, vs8, vs8,2 \r
- xxpermdi vs13, vs12, vs12,2 \r
\r
xvmaddasp vs32, vs0,vs24\r
xvmaddasp vs33, vs1,vs24\r
\r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxpermdi vs13, vs12, vs12,2 \r
+\r
+\r
xvmaddasp vs36, vs0,vs25\r
xvmaddasp vs37, vs1,vs25\r
\r
\r
xvmaddasp vs52, vs0,vs29\r
xvmaddasp vs53, vs1,vs29\r
-\r
+ lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)\r
+ lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)\r
xvmaddasp vs56, vs0,vs30\r
xvmaddasp vs57, vs1,vs30\r
\r
xvmaddasp vs60, vs0,vs31\r
xvmaddasp vs61, vs1,vs31\r
\r
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)\r
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
\r
lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)\r
lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)\r
\r
- xxperm vs26, vs24, permute_mask\r
- xxperm vs30, vs28, permute_mask \r
+\r
xxpermdi vs25, vs24, vs24,2 \r
xxpermdi vs29, vs28, vs28,2 \r
\r
\r
xvmaddasp vs52, vs4,vs13\r
xvmaddasp vs53, vs5,vs13\r
-\r
+ lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)\r
+ lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)\r
xvmaddasp vs56, vs4,vs14\r
xvmaddasp vs57, vs5,vs14\r
\r
xvmaddasp vs60, vs4,vs15\r
xvmaddasp vs61, vs5,vs15\r
\r
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)\r
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)\r
+ xxperm vs10, vs8, permute_mask\r
+ xxperm vs14, vs12, permute_mask \r
+ \r
\r
lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)\r
lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)\r
\r
- xxperm vs10, vs8, permute_mask\r
- xxperm vs14, vs12, permute_mask \r
+ \r
xxpermdi vs9, vs8, vs8,2 \r
xxpermdi vs13, vs12, vs12,2 \r
\r
\r
xvmaddasp vs52, vs0,vs29\r
xvmaddasp vs53, vs1,vs29\r
-\r
+.if \Complete==0\r
+ lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)\r
+ lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)\r
+.endif \r
xvmaddasp vs56, vs0,vs30\r
xvmaddasp vs57, vs1,vs30\r
-\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
+.endif \r
xvmaddasp vs60, vs0,vs31\r
xvmaddasp vs61, vs1,vs31\r
\r
-.if \Complete==0\r
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)\r
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)\r
\r
+.if \Complete==0\r
lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)\r
lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) \r
+.endif \r
\r
- xxperm vs26, vs24, permute_mask\r
- xxperm vs30, vs28, permute_mask \r
+.if \Complete==0 \r
xxpermdi vs25, vs24, vs24,2 \r
xxpermdi vs29, vs28, vs28,2 \r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
#define MY_ALIGN .align 3\r
+b ZGEMM_L2\r
\r
- srawi. J, N, 1\r
- ble ZGEMM_L2_END\r
-\r
-ZGEMM_L2_BEGIN:\r
-\r
- mr BO, B\r
- mr BBO, BBUFFER\r
- srawi. T1, K, 2\r
- ble ZGEMM_L2_COPYB1\r
-\r
-ZGEMM_L2_COPYB8:\r
-\r
- addi T2, PRE, 128\r
- dcbt BO, PRE\r
- dcbtst BBO, PRE\r
- dcbtst BBO, T2\r
- ZCOPYB_8\r
- addic. T1, T1, -1\r
-\r
- bgt ZGEMM_L2_COPYB8\r
-\r
-ZGEMM_L2_COPYB1:\r
-\r
- andi. T1, K, 3\r
- ble ZGEMM_L2_COPYB_END\r
-\r
-ZGEMM_L2_COPYB_LOOP:\r
-\r
- ZCOPYB_2\r
- addic. T1, T1, -1\r
-\r
- bgt ZGEMM_L2_COPYB_LOOP\r
-\r
-ZGEMM_L2_COPYB_END:\r
-\r
- mr CO, C\r
- mr AO, A\r
- slwi T1, LDC , 1\r
- add C, C, T1\r
- srawi. I, M, 3\r
- ble ZGEMM_L2x8_END\r
+/* MINI SUBROUTINES */\r
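+/* common unrolled K-loop bodies, factored out as bl/blr subroutines so that the main\r
+   loop and the remainder handling further down can share them */\r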
\r
-ZGEMM_L2x8_BEGIN:\r
\r
\r
- mr BO, BBUFFER\r
- mr T1, K\r
- addi T1,T1, -1\r
- srawi. L, T1, 5 /**(K-1) % 32x */ \r
- ZERO2x8 \r
- ble ZGEMM_L2x8_SUB0\r
- \r
-\r
-ZGEMM_L2x8_LOOP_START:\r
-\r
- LOAD2x8 0 \r
- li T2, 1024\r
- li T3, 1024+512\r
- li T4, 2048\r
- li T5, 2048+512\r
+/* 2x8 MAIN 128x+1 LOOP */ \r
+ZGEMM_L2x8_LMAIN_SUB: \r
mtctr L\r
-\r
+ LOAD2x8 0 \r
MY_ALIGN\r
ZGEMM_L2x8_LOOP:\r
- dcbt AO, PRE\r
+ dcbt AO, PRE\r
dcbt BO, PRE\r
- KERNEL2x8_L 128,64,0,0\r
- KERNEL2x8_L 128,64,1,0\r
+ KERNEL2x8_L 128,32,0,0 \r
+ KERNEL2x8_L 128,32,1,0\r
dcbt AO, T2 \r
- KERNEL2x8_L 128,64,2,0\r
- KERNEL2x8_L 128,64,3,0 \r
+ KERNEL2x8_L 128,32,2,0\r
+ KERNEL2x8_L 128,32,3,0 \r
dcbt AO, T3\r
dcbt BO, T2\r
- KERNEL2x8_L 128,64,4,0\r
- KERNEL2x8_L 128,64,5,0\r
+ KERNEL2x8_L 128,32,4,0\r
+ KERNEL2x8_L 128,32,5,0\r
dcbt AO, T4 \r
- KERNEL2x8_L 128,64,6,0\r
- KERNEL2x8_L 128,64,7,0 \r
+ KERNEL2x8_L 128,32,6,0\r
+ KERNEL2x8_L 128,32,7,0 \r
dcbt AO, T5 \r
dcbt BO, T3\r
- KERNEL2x8_L 128,64,8,0\r
- KERNEL2x8_L 128,64,9,0\r
- KERNEL2x8_L 128,64,10,0\r
- KERNEL2x8_L 128,64,11,0 \r
+ KERNEL2x8_L 128,32,8,0\r
+ KERNEL2x8_L 128,32,9,0\r
+ KERNEL2x8_L 128,32,10,0\r
+ KERNEL2x8_L 128,32,11,0 \r
dcbt BO, T4\r
- KERNEL2x8_L 128,64,12,0\r
- KERNEL2x8_L 128,64,13,0\r
- KERNEL2x8_L 128,64,14,0\r
- KERNEL2x8_L 128,64,15,1 \r
+ KERNEL2x8_L 128,32,12,0\r
+ KERNEL2x8_L 128,32,13,0\r
+ KERNEL2x8_L 128,32,14,0\r
+ KERNEL2x8_L 128,32,15,0 \r
+ KERNEL2x8_L 128,32,16,0\r
+ KERNEL2x8_L 128,32,17,0 \r
+ KERNEL2x8_L 128,32,18,0\r
+ KERNEL2x8_L 128,32,19,0 \r
+ KERNEL2x8_L 128,32,20,0\r
+ KERNEL2x8_L 128,32,21,0 \r
+ KERNEL2x8_L 128,32,22,0\r
+ KERNEL2x8_L 128,32,23,0 \r
+ KERNEL2x8_L 128,32,24,0\r
+ KERNEL2x8_L 128,32,25,0\r
+ KERNEL2x8_L 128,32,26,0\r
+ KERNEL2x8_L 128,32,27,0 \r
+ KERNEL2x8_L 128,32,28,0\r
+ KERNEL2x8_L 128,32,29,0\r
+ KERNEL2x8_L 128,32,30,0\r
+ KERNEL2x8_L 128,32,31,0 \r
+ KERNEL2x8_L 128,32,32,0\r
+ KERNEL2x8_L 128,32,33,0\r
+ KERNEL2x8_L 128,32,34,0\r
+ KERNEL2x8_L 128,32,35,0 \r
+ KERNEL2x8_L 128,32,36,0\r
+ KERNEL2x8_L 128,32,37,0\r
+ KERNEL2x8_L 128,32,38,0\r
+ KERNEL2x8_L 128,32,39,0 \r
+ KERNEL2x8_L 128,32,40,0\r
+ KERNEL2x8_L 128,32,41,0\r
+ KERNEL2x8_L 128,32,42,0\r
+ KERNEL2x8_L 128,32,43,0 \r
+ KERNEL2x8_L 128,32,44,0\r
+ KERNEL2x8_L 128,32,45,0\r
+ KERNEL2x8_L 128,32,46,0\r
+ KERNEL2x8_L 128,32,47,0 \r
+ KERNEL2x8_L 128,32,48,0\r
+ KERNEL2x8_L 128,32,49,0 \r
+ KERNEL2x8_L 128,32,50,0\r
+ KERNEL2x8_L 128,32,51,0 \r
+ KERNEL2x8_L 128,32,52,0\r
+ KERNEL2x8_L 128,32,53,0 \r
+ KERNEL2x8_L 128,32,54,0\r
+ KERNEL2x8_L 128,32,55,0 \r
+ KERNEL2x8_L 128,32,56,0\r
+ KERNEL2x8_L 128,32,57,0\r
+ KERNEL2x8_L 128,32,58,0\r
+ KERNEL2x8_L 128,32,59,0 \r
+ KERNEL2x8_L 128,32,60,0\r
+ KERNEL2x8_L 128,32,61,0\r
+ KERNEL2x8_L 128,32,62,0 \r
+ KERNEL2x8_L 128,32,63,1 \r
bdnz ZGEMM_L2x8_LOOP\r
MY_ALIGN \r
ZGEMM_L2x8_LOOP_END:\r
- END2x8 AO, BO, 128, 64 \r
- \r
- b ZGEMM_L2x8_SUB1\r
- \r
-ZGEMM_L2x8_SUB0:\r
+ END2x8 AO, BO, 128,32 \r
+ blr\r
\r
- andi. L, K, 63\r
- \r
- b ZGEMM_L2x8_SUB2\r
+ MY_ALIGN\r
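+/* 64 K-iterations of the 2x8 kernel, called from the remainder handling */\r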
+ZGEMM_2x8_L64_SUB:\r
+ LOAD2x8 0 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L 128,32,0,0 \r
+ KERNEL2x8_L 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L 128,32,2,0\r
+ KERNEL2x8_L 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L 128,32,4,0\r
+ KERNEL2x8_L 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L 128,32,6,0\r
+ KERNEL2x8_L 128,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L 128,32,8,0\r
+ KERNEL2x8_L 128,32,9,0\r
+ KERNEL2x8_L 128,32,10,0\r
+ KERNEL2x8_L 128,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L 128,32,12,0\r
+ KERNEL2x8_L 128,32,13,0\r
+ KERNEL2x8_L 128,32,14,0\r
+ KERNEL2x8_L 128,32,15,0 \r
+ KERNEL2x8_L 128,32,16,0\r
+ KERNEL2x8_L 128,32,17,0 \r
+ KERNEL2x8_L 128,32,18,0\r
+ KERNEL2x8_L 128,32,19,0 \r
+ KERNEL2x8_L 128,32,20,0\r
+ KERNEL2x8_L 128,32,21,0 \r
+ KERNEL2x8_L 128,32,22,0\r
+ KERNEL2x8_L 128,32,23,0 \r
+ KERNEL2x8_L 128,32,24,0\r
+ KERNEL2x8_L 128,32,25,0\r
+ KERNEL2x8_L 128,32,26,0\r
+ KERNEL2x8_L 128,32,27,0 \r
+ KERNEL2x8_L 128,32,28,0\r
+ KERNEL2x8_L 128,32,29,0\r
+ KERNEL2x8_L 128,32,30,0\r
+ KERNEL2x8_E 128,32,31,1\r
+ blr\r
\r
-ZGEMM_L2x8_SUB1:\r
\r
- andi. L, T1, 31\r
- ble ZGEMM_L2x8_SAVE\r
+ MY_ALIGN\r
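+/* 32 K-iterations of the 2x8 kernel */\r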
+ZGEMM_2x8_L32_SUB:\r
+ LOAD2x8 0 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L 128,32,0,0 \r
+ KERNEL2x8_L 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L 128,32,2,0\r
+ KERNEL2x8_L 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L 128,32,4,0\r
+ KERNEL2x8_L 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L 128,32,6,0\r
+ KERNEL2x8_L 128,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L 128,32,8,0\r
+ KERNEL2x8_L 128,32,9,0\r
+ KERNEL2x8_L 128,32,10,0\r
+ KERNEL2x8_L 128,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L 128,32,12,0\r
+ KERNEL2x8_L 128,32,13,0\r
+ KERNEL2x8_L 128,32,14,0\r
+ KERNEL2x8_L 128,32,15,1\r
+ blr\r
+ MY_ALIGN\r
+\r
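+/* 16 K-iterations of the 2x8 kernel */\r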
+ZGEMM_2x8_L16_SUB:\r
+ LOAD2x8 0 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L 128,32,0,0 \r
+ KERNEL2x8_L 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L 128,32,2,0\r
+ KERNEL2x8_L 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L 128,32,4,0\r
+ KERNEL2x8_L 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L 128,32,6,0\r
+ KERNEL2x8_L 128,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
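+/* 2x4 main loop: 32 K-iterations per pass, trip count in CTR (from L) */\r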
+ZGEMM_2x4_LMAIN_SUB:\r
+ mtctr L\r
+ LOAD2x4 0 \r
+ MY_ALIGN\r
+ZGEMM_L2x4_LOOP: \r
+ KERNEL2x4_L 64,32,0,0\r
+ KERNEL2x4_L 64,32,1,0 \r
+ KERNEL2x4_L 64,32,2,0\r
+ KERNEL2x4_L 64,32,3,0 \r
+ KERNEL2x4_L 64,32,4,0\r
+ KERNEL2x4_L 64,32,5,0 \r
+ KERNEL2x4_L 64,32,6,0\r
+ KERNEL2x4_L 64,32,7,0\r
+ KERNEL2x4_L 64,32,8,0\r
+ KERNEL2x4_L 64,32,9,0 \r
+ KERNEL2x4_L 64,32,10,0\r
+ KERNEL2x4_L 64,32,11,0 \r
+ KERNEL2x4_L 64,32,12,0\r
+ KERNEL2x4_L 64,32,13,0 \r
+ KERNEL2x4_L 64,32,14,0\r
+ KERNEL2x4_L 64,32,15,1 \r
+ bdnz ZGEMM_L2x4_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L2x4_LOOP_END:\r
+ END2x4 AO, BO, 64,32 \r
+ blr\r
+\r
+ MY_ALIGN\r
+ZGEMM_2x4_L16_SUB:\r
+ LOAD2x4 0 \r
+ KERNEL2x4_L 64,32, 0,0\r
+ KERNEL2x4_L 64,32, 1,0\r
+ KERNEL2x4_L 64,32, 2,0\r
+ KERNEL2x4_L 64,32, 3,0\r
+ KERNEL2x4_L 64,32, 4,0\r
+ KERNEL2x4_L 64,32, 5,0\r
+ KERNEL2x4_L 64,32, 6,0\r
+ KERNEL2x4_E 64,32, 7,1\r
+ blr\r
+\r
+ MY_ALIGN\r
+ZGEMM_2x4_L8_SUB:\r
+ LOAD2x4 0 \r
+ KERNEL2x4_L 64,32, 0,0\r
+ KERNEL2x4_L 64,32, 1,0\r
+ KERNEL2x4_L 64,32, 2,0\r
+ KERNEL2x4_E 64,32, 3,1\r
+ blr\r
\r
+/* MAIN LOOP BEGINS */\r
+\r
+ MY_ALIGN\r
+ZGEMM_L2:\r
+ srawi. J, N, 1\r
+ ble ZGEMM_L2_END\r
+\r
+ZGEMM_L2_BEGIN:\r
+ mr CO, C\r
+ slwi T1, LDC , 1 \r
+ add T2,C,LDC \r
+ mr AO, A \r
+ add C, C, T1\r
+ srawi. I, M, 3\r
+ ble ZGEMM_L2x8_END\r
+ dcbt CO,r0 /*just prefetch*/\r
+ dcbt T2,r0 \r
+ZGEMM_L2x8_BEGIN: \r
+ mr T1, K\r
+ mr BO, B \r
+ dcbt B, r0 \r
+ dcbt AO, r0 \r
+ /* TEMPS FOR PREFETCH */\r
+ li T2, 1024\r
+ li T3, 1024+512\r
+\r
+ addi T1,T1, -1\r
+ /* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+ srawi. L, T1, 7 /**(K-1) % 128x */ \r
+\r
+ ZERO2x8 \r
+ ble ZGEMM_L2x8_SUB0\r
+ bl ZGEMM_L2x8_LMAIN_SUB \r
+ \r
+ andi. L, T1, 127\r
+ ble ZGEMM_L2x8_SAVE\r
+ b ZGEMM_L2x8_SUB2\r
+ \r
+ZGEMM_L2x8_SUB0: \r
+ andi. L, K, 255\r
+ cmpwi K,128\r
+ bne ZGEMM_L2x8_SUB2 \r
+ MY_ALIGN \r
+ZGEMM_L2x8_SUB2_128:\r
+ bl ZGEMM_2x8_L64_SUB\r
+ bl ZGEMM_2x8_L64_SUB \r
+ b ZGEMM_L2x8_SAVE \r
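+ /* remainder: peel the leftover K-iterations by testing the 64/32/16/8/4/2/1 bits of L */\r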
+ MY_ALIGN\r
ZGEMM_L2x8_SUB2:\r
- srawi. T1,L, 3\r
- ble ZGEMM_L2x8_SUB2_4\r
- mtctr T1\r
+ andi. T1,L, 64\r
+ ble ZGEMM_L2x8_SUB2_32\r
+ bl ZGEMM_2x8_L64_SUB\r
MY_ALIGN\r
-ZGEMM_L2x8_SUB2_LOOP:\r
+ZGEMM_L2x8_SUB2_32:\r
+ andi. T1,L, 32\r
+ ble ZGEMM_L2x8_SUB2_16 \r
+ bl ZGEMM_2x8_L32_SUB\r
+ MY_ALIGN \r
+ZGEMM_L2x8_SUB2_16:\r
+ andi. T1,L, 16\r
+ ble ZGEMM_L2x8_SUB2_8\r
+ bl ZGEMM_2x8_L16_SUB \r
+ MY_ALIGN \r
+ZGEMM_L2x8_SUB2_8:\r
+ andi. T1,L, 8\r
+ ble ZGEMM_L2x8_SUB2_4\r
LOAD2x8 0 \r
- KERNEL2x8_L 128,64, 0,0\r
- KERNEL2x8_L 128,64, 1,0\r
- KERNEL2x8_L 128,64, 2,0\r
- KERNEL2x8_E 128,64, 3,1\r
- bdnz ZGEMM_L2x8_SUB2_LOOP \r
- MY_ALIGN \r
+ KERNEL2x8_L 128,32, 0,0\r
+ KERNEL2x8_L 128,32, 1,0\r
+ KERNEL2x8_L 128,32, 2,0\r
+ KERNEL2x8_E 128,32, 3,1\r
+ MY_ALIGN \r
ZGEMM_L2x8_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L2x8_SUB2_2\r
LOAD2x8 0 \r
- KERNEL2x8_L 128,64, 0,0\r
- KERNEL2x8_E 128,64, 1,1\r
+ KERNEL2x8_L 128,32, 0,0\r
+ KERNEL2x8_E 128,32, 1,1\r
MY_ALIGN\r
ZGEMM_L2x8_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L2x8_SUB2_1\r
LOAD2x8 0 \r
- KERNEL2x8_E 128,64, 0,1\r
+ KERNEL2x8_E 128,32, 0,1\r
MY_ALIGN \r
ZGEMM_L2x8_SUB2_1:\r
andi. T1,L, 1\r
ble ZGEMM_L2x8_SAVE \r
- KERNEL2x8 \r
-\r
-/* addic. L, L, -1\r
- bgt ZGEMM_L2x8_SUB2_1*/\r
+ KERNEL2x8 \r
\r
ZGEMM_L2x8_SAVE:\r
-\r
+ addic. I, I, -1\r
SAVE2x8\r
\r
- addic. I, I, -1\r
bgt ZGEMM_L2x8_BEGIN\r
\r
+ andi. T2, M, 7\r
+ ble ZGEMM_L2x1_END\r
+\r
+ andi. T1, M, 4\r
+ ble ZGEMM_L2x4_END\r
+ b ZGEMM_L2x4_BEGIN\r
+ MY_ALIGN \r
ZGEMM_L2x8_END:\r
\r
ZGEMM_L2x4_BEGIN:\r
\r
andi. T1, M, 4\r
ble ZGEMM_L2x4_END\r
- mr BO, BBUFFER\r
+ mr BO, B\r
mr T1, K\r
addi T1,T1, -1\r
- srawi. L, T1, 4 /**(K-1) % 16x */ \r
- ZERO2x4 \r
- ble ZGEMM_L2x4_SUB0 \r
-\r
-ZGEMM_L2x4_LOOP_START:\r
- LOAD2x4 0 \r
- mtctr L\r
+ ZERO2x4 \r
+ srawi. L, T1, 5 /**(K-1) % 32x */ \r
\r
- MY_ALIGN\r
-ZGEMM_L2x4_LOOP: \r
- KERNEL2x4_L 64,64,0,0\r
- KERNEL2x4_L 64,64,1,0 \r
- KERNEL2x4_L 64,64,2,0\r
- KERNEL2x4_L 64,64,3,0 \r
- KERNEL2x4_L 64,64,4,0\r
- KERNEL2x4_L 64,64,5,0 \r
- KERNEL2x4_L 64,64,6,0\r
- KERNEL2x4_L 64,64,7,1 \r
- bdnz ZGEMM_L2x4_LOOP\r
- MY_ALIGN \r
-ZGEMM_L2x4_LOOP_END:\r
- END2x4 AO, BO, 64, 64 \r
- \r
- b ZGEMM_L2x4_SUB1\r
- \r
-ZGEMM_L2x4_SUB0:\r
-\r
- andi. L, K, 31\r
- \r
- b ZGEMM_L2x4_SUB2\r
-\r
-ZGEMM_L2x4_SUB1:\r
-\r
- andi. L, T1, 15\r
+ ble ZGEMM_L2x4_SUB0 \r
+ bl ZGEMM_2x4_LMAIN_SUB\r
+ andi. L, T1, 31\r
ble ZGEMM_L2x4_SAVE\r
+ b ZGEMM_L2x4_SUB2\r
\r
-ZGEMM_L2x4_SUB2:\r
- srawi. T1,L, 3\r
- ble ZGEMM_L2x4_SUB2_4\r
- mtctr T1\r
+ZGEMM_L2x4_SUB0:\r
+ andi. L, K, 63\r
+ cmpwi K,32\r
+ bne ZGEMM_L2x4_SUB2 \r
+ MY_ALIGN \r
+ZGEMM_L2x4_SUB2_32:\r
+ bl ZGEMM_2x4_L16_SUB\r
+ bl ZGEMM_2x4_L16_SUB \r
+ b ZGEMM_L2x4_SAVE \r
+ MY_ALIGN \r
+ZGEMM_L2x4_SUB2: \r
+ andi. T1,L, 16\r
+ ble ZGEMM_L2x4_SUB2_8\r
+ bl ZGEMM_2x4_L16_SUB \r
MY_ALIGN\r
-ZGEMM_L2x4_SUB2_LOOP:\r
- LOAD2x4 0 \r
- KERNEL2x4_L 64,64, 0,0\r
- KERNEL2x4_L 64,64, 1,0\r
- KERNEL2x4_L 64,64, 2,0\r
- KERNEL2x4_E 64,64, 3,1\r
- bdnz ZGEMM_L2x4_SUB2_LOOP \r
+ZGEMM_L2x4_SUB2_8: \r
+ andi. T1,L, 8\r
+ ble ZGEMM_L2x4_SUB2_4\r
+ bl ZGEMM_2x4_L8_SUB\r
MY_ALIGN \r
ZGEMM_L2x4_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L2x4_SUB2_2\r
LOAD2x4 0 \r
- KERNEL2x4_L 64,64, 0,0\r
- KERNEL2x4_E 64,64, 1,1\r
+ KERNEL2x4_L 64,32, 0,0\r
+ KERNEL2x4_E 64,32, 1,1\r
MY_ALIGN\r
ZGEMM_L2x4_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L2x4_SUB2_1\r
LOAD2x4 0 \r
- KERNEL2x4_E 64,64, 0,1\r
+ KERNEL2x4_E 64,32, 0,1\r
MY_ALIGN \r
ZGEMM_L2x4_SUB2_1:\r
andi. T1,L, 1\r
\r
ZGEMM_L2x4_END:\r
\r
-ZGEMM_L2x2_BEGIN:\r
-\r
+ZGEMM_L2x2_BEGIN: \r
\r
andi. T1, M, 2\r
ble ZGEMM_L2x2_END\r
- mr BO, BBUFFER\r
+ mr BO, B\r
mr T1, K\r
addi T1,T1, -1\r
srawi. L, T1, 4 /**(K-1) % 16x */ \r
\r
MY_ALIGN\r
ZGEMM_L2x2_LOOP: \r
- KERNEL2x2_L 32,64,0,0\r
- KERNEL2x2_L 32,64,1,0 \r
- KERNEL2x2_L 32,64,2,0\r
- KERNEL2x2_L 32,64,3,0 \r
- KERNEL2x2_L 32,64,4,0\r
- KERNEL2x2_L 32,64,5,0 \r
- KERNEL2x2_L 32,64,6,0\r
- KERNEL2x2_L 32,64,7,1 \r
+ KERNEL2x2_L 32,32,0,0\r
+ KERNEL2x2_L 32,32,1,0 \r
+ KERNEL2x2_L 32,32,2,0\r
+ KERNEL2x2_L 32,32,3,0 \r
+ KERNEL2x2_L 32,32,4,0\r
+ KERNEL2x2_L 32,32,5,0 \r
+ KERNEL2x2_L 32,32,6,0\r
+ KERNEL2x2_L 32,32,7,1 \r
bdnz ZGEMM_L2x2_LOOP\r
MY_ALIGN \r
ZGEMM_L2x2_LOOP_END:\r
- END2x2 AO, BO, 32, 64 \r
+ END2x2 AO, BO, 32,32 \r
\r
b ZGEMM_L2x2_SUB1\r
\r
MY_ALIGN\r
ZGEMM_L2x2_SUB2_LOOP:\r
LOAD2x2 0 \r
- KERNEL2x2_L 32,64, 0,0\r
- KERNEL2x2_L 32,64, 1,0\r
- KERNEL2x2_L 32,64, 2,0\r
- KERNEL2x2_E 32,64, 3,1\r
+ KERNEL2x2_L 32,32, 0,0\r
+ KERNEL2x2_L 32,32, 1,0\r
+ KERNEL2x2_L 32,32, 2,0\r
+ KERNEL2x2_E 32,32, 3,1\r
bdnz ZGEMM_L2x2_SUB2_LOOP \r
MY_ALIGN \r
ZGEMM_L2x2_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L2x2_SUB2_2\r
LOAD2x2 0 \r
- KERNEL2x2_L 32,64, 0,0\r
- KERNEL2x2_E 32,64, 1,1\r
+ KERNEL2x2_L 32,32, 0,0\r
+ KERNEL2x2_E 32,32, 1,1\r
MY_ALIGN\r
ZGEMM_L2x2_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L2x2_SUB2_1\r
LOAD2x2 0 \r
- KERNEL2x2_E 32,64, 0,1\r
+ KERNEL2x2_E 32,32, 0,1\r
MY_ALIGN \r
ZGEMM_L2x2_SUB2_1:\r
andi. T1,L, 1\r
\r
ZGEMM_L2x2_END:\r
\r
-ZGEMM_L2x1_BEGIN:\r
\r
\r
+ZGEMM_L2x1_BEGIN: \r
andi. T1, M, 1\r
ble ZGEMM_L2x1_END\r
- mr BO, BBUFFER\r
+ mr BO, B\r
mr T1, K\r
addi T1,T1, -1\r
srawi. L, T1, 4 /**(K-1) % 16x */ \r
\r
MY_ALIGN\r
ZGEMM_L2x1_LOOP: \r
- KERNEL2x1_L 16,64,0,0\r
- KERNEL2x1_L 16,64,1,0 \r
- KERNEL2x1_L 16,64,2,0\r
- KERNEL2x1_L 16,64,3,0 \r
- KERNEL2x1_L 16,64,4,0\r
- KERNEL2x1_L 16,64,5,0 \r
- KERNEL2x1_L 16,64,6,0\r
- KERNEL2x1_L 16,64,7,1 \r
+ KERNEL2x1_L 16,32,0,0\r
+ KERNEL2x1_L 16,32,1,0 \r
+ KERNEL2x1_L 16,32,2,0\r
+ KERNEL2x1_L 16,32,3,0 \r
+ KERNEL2x1_L 16,32,4,0\r
+ KERNEL2x1_L 16,32,5,0 \r
+ KERNEL2x1_L 16,32,6,0\r
+ KERNEL2x1_L 16,32,7,1 \r
bdnz ZGEMM_L2x1_LOOP\r
MY_ALIGN \r
ZGEMM_L2x1_LOOP_END:\r
- END2x1 AO, BO, 16, 64 \r
+ END2x1 AO, BO, 16,32 \r
\r
b ZGEMM_L2x1_SUB1\r
\r
MY_ALIGN\r
ZGEMM_L2x1_SUB2_LOOP:\r
LOAD2x1 0 \r
- KERNEL2x1_L 16,64, 0,0\r
- KERNEL2x1_L 16,64, 1,0\r
- KERNEL2x1_L 16,64, 2,0\r
- KERNEL2x1_E 16,64, 3,1\r
+ KERNEL2x1_L 16,32, 0,0\r
+ KERNEL2x1_L 16,32, 1,0\r
+ KERNEL2x1_L 16,32, 2,0\r
+ KERNEL2x1_E 16,32, 3,1\r
bdnz ZGEMM_L2x1_SUB2_LOOP \r
MY_ALIGN \r
ZGEMM_L2x1_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L2x1_SUB2_2\r
LOAD2x1 0 \r
- KERNEL2x1_L 16,64, 0,0\r
- KERNEL2x1_E 16,64, 1,1\r
+ KERNEL2x1_L 16,32, 0,0\r
+ KERNEL2x1_E 16,32, 1,1\r
MY_ALIGN\r
ZGEMM_L2x1_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L2x1_SUB2_1\r
LOAD2x1 0 \r
- KERNEL2x1_E 16,64, 0,1\r
+ KERNEL2x1_E 16,32, 0,1\r
MY_ALIGN \r
ZGEMM_L2x1_SUB2_1:\r
andi. T1,L, 1\r
andi. T1, N, 1\r
ble ZGEMM_L1_END\r
\r
- mr BO, B\r
- mr BBO, BBUFFER \r
- srawi. T1, K, 3 /*this time K/8 */\r
- ble ZGEMM_L1_COPYB1\r
-\r
-ZGEMM_L1_COPYB8:\r
-\r
- addi T2, PRE, 128\r
- dcbt BO, PRE\r
- dcbtst BBO, PRE\r
- dcbtst BBO, T2\r
- ZCOPYB_8\r
- addic. T1, T1, -1\r
-\r
- bgt ZGEMM_L1_COPYB8\r
-\r
-ZGEMM_L1_COPYB1:\r
-\r
- andi. T1, K, 7\r
- ble ZGEMM_L1_COPYB_END\r
-\r
-ZGEMM_L1_COPYB_LOOP:\r
-\r
- ZCOPYB_1\r
- addic. T1, T1, -1\r
-\r
- bgt ZGEMM_L1_COPYB_LOOP\r
-\r
-ZGEMM_L1_COPYB_END:\r
-\r
mr CO, C\r
mr AO, A\r
srawi. I, M, 3\r
ZGEMM_L1x8_BEGIN:\r
\r
\r
- mr BO, BBUFFER\r
+ mr BO, B\r
mr T1, K\r
addi T1,T1, -1\r
srawi. L, T1, 5 /**(K-1) % 32x */ \r
ZGEMM_L1x8_LOOP:\r
dcbt AO, PRE\r
dcbt BO, PRE\r
- KERNEL1x8_L 128,32,0,0\r
- KERNEL1x8_L 128,32,1,0\r
+ KERNEL1x8_L 128,16,0,0\r
+ KERNEL1x8_L 128,16,1,0\r
dcbt AO, T2 \r
- KERNEL1x8_L 128,32,2,0\r
- KERNEL1x8_L 128,32,3,0 \r
+ KERNEL1x8_L 128,16,2,0\r
+ KERNEL1x8_L 128,16,3,0 \r
dcbt AO, T3\r
dcbt BO, T2\r
- KERNEL1x8_L 128,32,4,0\r
- KERNEL1x8_L 128,32,5,0\r
+ KERNEL1x8_L 128,16,4,0\r
+ KERNEL1x8_L 128,16,5,0\r
dcbt AO, T4 \r
- KERNEL1x8_L 128,32,6,0\r
- KERNEL1x8_L 128,32,7,0 \r
+ KERNEL1x8_L 128,16,6,0\r
+ KERNEL1x8_L 128,16,7,0 \r
dcbt AO, T5 \r
dcbt BO, T3\r
- KERNEL1x8_L 128,32,8,0\r
- KERNEL1x8_L 128,32,9,0\r
- KERNEL1x8_L 128,32,10,0\r
- KERNEL1x8_L 128,32,11,0 \r
+ KERNEL1x8_L 128,16,8,0\r
+ KERNEL1x8_L 128,16,9,0\r
+ KERNEL1x8_L 128,16,10,0\r
+ KERNEL1x8_L 128,16,11,0 \r
dcbt BO, T4\r
- KERNEL1x8_L 128,32,12,0\r
- KERNEL1x8_L 128,32,13,0\r
- KERNEL1x8_L 128,32,14,0\r
- KERNEL1x8_L 128,32,15,1 \r
+ KERNEL1x8_L 128,16,12,0\r
+ KERNEL1x8_L 128,16,13,0\r
+ KERNEL1x8_L 128,16,14,0\r
+ KERNEL1x8_L 128,16,15,1 \r
bdnz ZGEMM_L1x8_LOOP\r
MY_ALIGN \r
ZGEMM_L1x8_LOOP_END:\r
- END1x8 AO, BO, 128, 32 \r
+ END1x8 AO, BO, 128,16 \r
\r
b ZGEMM_L1x8_SUB1\r
\r
MY_ALIGN\r
ZGEMM_L1x8_SUB2_LOOP:\r
LOAD1x8 0 \r
- KERNEL1x8_L 128,32, 0,0\r
- KERNEL1x8_L 128,32, 1,0\r
- KERNEL1x8_L 128,32, 2,0\r
- KERNEL1x8_E 128,32, 3,1\r
+ KERNEL1x8_L 128,16, 0,0\r
+ KERNEL1x8_L 128,16, 1,0\r
+ KERNEL1x8_L 128,16, 2,0\r
+ KERNEL1x8_E 128,16, 3,1\r
bdnz ZGEMM_L1x8_SUB2_LOOP \r
MY_ALIGN \r
ZGEMM_L1x8_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L1x8_SUB2_2\r
LOAD1x8 0 \r
- KERNEL1x8_L 128,32, 0,0\r
- KERNEL1x8_E 128,32, 1,1\r
+ KERNEL1x8_L 128,16, 0,0\r
+ KERNEL1x8_E 128,16, 1,1\r
MY_ALIGN\r
ZGEMM_L1x8_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L1x8_SUB2_1\r
LOAD1x8 0 \r
- KERNEL1x8_E 128,32, 0,1\r
+ KERNEL1x8_E 128,16, 0,1\r
MY_ALIGN \r
ZGEMM_L1x8_SUB2_1:\r
andi. T1,L, 1\r
ble ZGEMM_L1x8_SAVE \r
KERNEL1x8 \r
-\r
-/* addic. L, L, -1\r
- bgt ZGEMM_L1x8_SUB2_1*/\r
+ \r
\r
ZGEMM_L1x8_SAVE:\r
\r
\r
andi. T1, M, 4\r
ble ZGEMM_L1x4_END\r
- mr BO, BBUFFER\r
+ mr BO, B\r
mr T1, K\r
addi T1,T1, -1\r
srawi. L, T1, 5 /**(K-1) % 16x */ \r
\r
MY_ALIGN\r
ZGEMM_L1x4_LOOP: \r
- KERNEL1x4_L 64,32,0,0\r
- KERNEL1x4_L 64,32,1,0 \r
- KERNEL1x4_L 64,32,2,0\r
- KERNEL1x4_L 64,32,3,0 \r
- KERNEL1x4_L 64,32,4,0\r
- KERNEL1x4_L 64,32,5,0 \r
- KERNEL1x4_L 64,32,6,0\r
- KERNEL1x4_L 64,32,7,0 \r
- KERNEL1x4_L 64,32,8,0\r
- KERNEL1x4_L 64,32,9,0\r
- KERNEL1x4_L 64,32,10,0\r
- KERNEL1x4_L 64,32,11,0 \r
- KERNEL1x4_L 64,32,12,0\r
- KERNEL1x4_L 64,32,13,0\r
- KERNEL1x4_L 64,32,14,0\r
- KERNEL1x4_L 64,32,15,1 \r
+ KERNEL1x4_L 64,16,0,0\r
+ KERNEL1x4_L 64,16,1,0 \r
+ KERNEL1x4_L 64,16,2,0\r
+ KERNEL1x4_L 64,16,3,0 \r
+ KERNEL1x4_L 64,16,4,0\r
+ KERNEL1x4_L 64,16,5,0 \r
+ KERNEL1x4_L 64,16,6,0\r
+ KERNEL1x4_L 64,16,7,0 \r
+ KERNEL1x4_L 64,16,8,0\r
+ KERNEL1x4_L 64,16,9,0\r
+ KERNEL1x4_L 64,16,10,0\r
+ KERNEL1x4_L 64,16,11,0 \r
+ KERNEL1x4_L 64,16,12,0\r
+ KERNEL1x4_L 64,16,13,0\r
+ KERNEL1x4_L 64,16,14,0\r
+ KERNEL1x4_L 64,16,15,1 \r
bdnz ZGEMM_L1x4_LOOP\r
MY_ALIGN \r
ZGEMM_L1x4_LOOP_END:\r
- END1x4 AO, BO, 64, 32 \r
+ END1x4 AO, BO, 64,16 \r
\r
b ZGEMM_L1x4_SUB1\r
\r
MY_ALIGN\r
ZGEMM_L1x4_SUB2_LOOP:\r
LOAD1x4 0 \r
- KERNEL1x4_L 64,32, 0,0\r
- KERNEL1x4_L 64,32, 1,0\r
- KERNEL1x4_L 64,32, 2,0\r
- KERNEL1x4_E 64,32, 3,1\r
+ KERNEL1x4_L 64,16, 0,0\r
+ KERNEL1x4_L 64,16, 1,0\r
+ KERNEL1x4_L 64,16, 2,0\r
+ KERNEL1x4_E 64,16, 3,1\r
bdnz ZGEMM_L1x4_SUB2_LOOP \r
MY_ALIGN \r
ZGEMM_L1x4_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L1x4_SUB2_2\r
LOAD1x4 0 \r
- KERNEL1x4_L 64,32, 0,0\r
- KERNEL1x4_E 64,32, 1,1\r
+ KERNEL1x4_L 64,16, 0,0\r
+ KERNEL1x4_E 64,16, 1,1\r
MY_ALIGN\r
ZGEMM_L1x4_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L1x4_SUB2_1\r
LOAD1x4 0 \r
- KERNEL1x4_E 64,32, 0,1\r
+ KERNEL1x4_E 64,16, 0,1\r
MY_ALIGN \r
ZGEMM_L1x4_SUB2_1:\r
andi. T1,L, 1\r
\r
andi. T1, M, 2\r
ble ZGEMM_L1x2_END\r
- mr BO, BBUFFER\r
+ mr BO, B\r
mr T1, K\r
addi T1,T1, -1\r
srawi. L, T1, 5 /**(K-1) % 16x */ \r
\r
MY_ALIGN\r
ZGEMM_L1x2_LOOP: \r
- KERNEL1x2_L 32,32,0,0\r
- KERNEL1x2_L 32,32,1,0 \r
- KERNEL1x2_L 32,32,2,0\r
- KERNEL1x2_L 32,32,3,0 \r
- KERNEL1x2_L 32,32,4,0\r
- KERNEL1x2_L 32,32,5,0 \r
- KERNEL1x2_L 32,32,6,0\r
- KERNEL1x2_L 32,32,7,0 \r
- KERNEL1x2_L 32,32,8,0\r
- KERNEL1x2_L 32,32,9,0\r
- KERNEL1x2_L 32,32,10,0\r
- KERNEL1x2_L 32,32,11,0 \r
- KERNEL1x2_L 32,32,12,0\r
- KERNEL1x2_L 32,32,13,0\r
- KERNEL1x2_L 32,32,14,0\r
- KERNEL1x2_L 32,32,15,1 \r
+ KERNEL1x2_L 32,16,0,0\r
+ KERNEL1x2_L 32,16,1,0 \r
+ KERNEL1x2_L 32,16,2,0\r
+ KERNEL1x2_L 32,16,3,0 \r
+ KERNEL1x2_L 32,16,4,0\r
+ KERNEL1x2_L 32,16,5,0 \r
+ KERNEL1x2_L 32,16,6,0\r
+ KERNEL1x2_L 32,16,7,0 \r
+ KERNEL1x2_L 32,16,8,0\r
+ KERNEL1x2_L 32,16,9,0\r
+ KERNEL1x2_L 32,16,10,0\r
+ KERNEL1x2_L 32,16,11,0 \r
+ KERNEL1x2_L 32,16,12,0\r
+ KERNEL1x2_L 32,16,13,0\r
+ KERNEL1x2_L 32,16,14,0\r
+ KERNEL1x2_L 32,16,15,1 \r
bdnz ZGEMM_L1x2_LOOP\r
MY_ALIGN \r
ZGEMM_L1x2_LOOP_END:\r
- END1x2 AO, BO, 32, 32 \r
+ END1x2 AO, BO, 32,16 \r
\r
b ZGEMM_L1x2_SUB1\r
\r
MY_ALIGN\r
ZGEMM_L1x2_SUB2_LOOP:\r
LOAD1x2 0 \r
- KERNEL1x2_L 32,32, 0,0\r
- KERNEL1x2_L 32,32, 1,0\r
- KERNEL1x2_L 32,32, 2,0\r
- KERNEL1x2_E 32,32, 3,1\r
+ KERNEL1x2_L 32,16, 0,0\r
+ KERNEL1x2_L 32,16, 1,0\r
+ KERNEL1x2_L 32,16, 2,0\r
+ KERNEL1x2_E 32,16, 3,1\r
bdnz ZGEMM_L1x2_SUB2_LOOP \r
MY_ALIGN \r
ZGEMM_L1x2_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L1x2_SUB2_2\r
LOAD1x2 0 \r
- KERNEL1x2_L 32,32, 0,0\r
- KERNEL1x2_E 32,32, 1,1\r
+ KERNEL1x2_L 32,16, 0,0\r
+ KERNEL1x2_E 32,16, 1,1\r
MY_ALIGN\r
ZGEMM_L1x2_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L1x2_SUB2_1\r
LOAD1x2 0 \r
- KERNEL1x2_E 32,32, 0,1\r
+ KERNEL1x2_E 32,16, 0,1\r
MY_ALIGN \r
ZGEMM_L1x2_SUB2_1:\r
andi. T1,L, 1\r
\r
andi. T1, M, 1\r
ble ZGEMM_L1x1_END\r
- mr BO, BBUFFER\r
+ mr BO, B\r
mr T1, K\r
addi T1,T1, -1\r
srawi. L, T1, 5 /**(K-1) % 16x */ \r
\r
MY_ALIGN\r
ZGEMM_L1x1_LOOP: \r
- KERNEL1x1_L 16,32,0,0\r
- KERNEL1x1_L 16,32,1,0 \r
- KERNEL1x1_L 16,32,2,0\r
- KERNEL1x1_L 16,32,3,0 \r
- KERNEL1x1_L 16,32,4,0\r
- KERNEL1x1_L 16,32,5,0 \r
- KERNEL1x1_L 16,32,6,0\r
- KERNEL1x1_L 16,32,7,0 \r
- KERNEL1x1_L 16,32,8,0\r
- KERNEL1x1_L 16,32,9,0\r
- KERNEL1x1_L 16,32,10,0\r
- KERNEL1x1_L 16,32,11,0 \r
- KERNEL1x1_L 16,32,12,0\r
- KERNEL1x1_L 16,32,13,0\r
- KERNEL1x1_L 16,32,14,0\r
- KERNEL1x1_L 16,32,15,1 \r
+ KERNEL1x1_L 16,16,0,0\r
+ KERNEL1x1_L 16,16,1,0 \r
+ KERNEL1x1_L 16,16,2,0\r
+ KERNEL1x1_L 16,16,3,0 \r
+ KERNEL1x1_L 16,16,4,0\r
+ KERNEL1x1_L 16,16,5,0 \r
+ KERNEL1x1_L 16,16,6,0\r
+ KERNEL1x1_L 16,16,7,0 \r
+ KERNEL1x1_L 16,16,8,0\r
+ KERNEL1x1_L 16,16,9,0\r
+ KERNEL1x1_L 16,16,10,0\r
+ KERNEL1x1_L 16,16,11,0 \r
+ KERNEL1x1_L 16,16,12,0\r
+ KERNEL1x1_L 16,16,13,0\r
+ KERNEL1x1_L 16,16,14,0\r
+ KERNEL1x1_L 16,16,15,1 \r
bdnz ZGEMM_L1x1_LOOP\r
MY_ALIGN \r
ZGEMM_L1x1_LOOP_END:\r
- END1x1 AO, BO, 16, 32 \r
+ END1x1 AO, BO, 16, 16 \r
\r
b ZGEMM_L1x1_SUB1\r
\r
MY_ALIGN\r
ZGEMM_L1x1_SUB2_LOOP:\r
LOAD1x1 0 \r
- KERNEL1x1_L 16,32, 0,0\r
- KERNEL1x1_L 16,32, 1,0\r
- KERNEL1x1_L 16,32, 2,0\r
- KERNEL1x1_E 16,32, 3,1\r
+ KERNEL1x1_L 16,16, 0,0\r
+ KERNEL1x1_L 16,16, 1,0\r
+ KERNEL1x1_L 16,16, 2,0\r
+ KERNEL1x1_E 16,16, 3,1\r
bdnz ZGEMM_L1x1_SUB2_LOOP \r
MY_ALIGN \r
ZGEMM_L1x1_SUB2_4:\r
andi. T1,L, 4\r
ble ZGEMM_L1x1_SUB2_2\r
LOAD1x1 0 \r
- KERNEL1x1_L 16,32, 0,0\r
- KERNEL1x1_E 16,32, 1,1\r
+ KERNEL1x1_L 16,16, 0,0\r
+ KERNEL1x1_E 16,16, 1,1\r
MY_ALIGN\r
ZGEMM_L1x1_SUB2_2:\r
andi. T1,L, 2\r
ble ZGEMM_L1x1_SUB2_1\r
LOAD1x1 0 \r
- KERNEL1x1_E 16,32, 0,1\r
+ KERNEL1x1_E 16,16, 0,1\r
MY_ALIGN \r
ZGEMM_L1x1_SUB2_1:\r
andi. T1,L, 1\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
-\r
- #define XSFADD_R1 xsadddp\r
- #define XSFADD_R2 xssubdp\r
- #define XSFADD_I1 xsadddp\r
- #define XSFADD_I2 xsadddp\r
-\r
-#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)\r
-\r
- #define XSFADD_R1 xsadddp\r
- #define XSFADD_R2 xsadddp\r
- #define XSFADD_I1 xssubdp\r
- #define XSFADD_I2 xsadddp\r
-\r
-#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)\r
-\r
- #define XSFADD_R1 xsadddp\r
- #define XSFADD_R2 xsadddp\r
- #define XSFADD_I1 xsadddp\r
- #define XSFADD_I2 xssubdp\r
-\r
-#else // CC || CR || RC || RR\r
-\r
- #define XSFADD_R1 xsadddp\r
- #define XSFADD_R2 xssubdp\r
- #define XSFADD_I1 xssubdp\r
- #define XSFADD_I2 xssubdp\r
-\r
-#endif\r
-\r
-.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V\r
- AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7\r
-.endm\r
-\r
-.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8\r
- xxlxor \TEMP1, \TEMP1, \TEMP1\r
- xxlxor \TEMP2, \TEMP2, \TEMP2\r
- \r
- xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB \r
-\r
- XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB\r
- XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB\r
-\r
- xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB \r
- xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB \r
-\r
- XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB\r
- XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB\r
-\r
- xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i\r
- xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r \r
- xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r \r
- xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i\r
-\r
- xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i\r
- xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r\r
- xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part\r
-.endm\r
-\r
-/**********************************************************************************************\r
-* Macros for N=2 and M=8\r
-**********************************************************************************************/\r
\r
#define unit_size 16\r
#define DISP32(ind,disp) (ind*unit_size*32+disp)\r
#define DISP4(ind,disp) (ind*unit_size*4+disp)\r
#define DISP2(ind,disp) (ind*unit_size*2+disp)\r
#define DISP1(ind,disp) (ind*unit_size+disp)\r
+#define DISPX(disp) (disp)\r
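+/* DISPn(ind,disp) = ind*n*unit_size + disp, a byte displacement in units of 16-byte\r
+   (double-complex) elements; DISPX(disp) is a plain byte offset */\r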
+\r
+/* HELPERS FOR SAVE */\r
+\r
+/* load {r0,i0} and {r1,i1} from memory and repack them as {r0,r1} and {i0,i1} */ \r
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET \r
+#ifndef TRMMKERNEL \r
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)\r
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)\r
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2\r
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 \r
+#endif \r
+.endm\r
+\r
+/*from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/\r
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2\r
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/\r
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/\r
+.endm \r
+\r
+/*from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/\r
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 \r
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */\r
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/\r
+.endm\r
+\r
+/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */\r
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) \r
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) \r
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) \r
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 \r
+#else // CC || CR || RC || RR \r
+ /* for this case alpha is assumed to be passed as {-alpha_r,-alpha_i} */\r
+ /* this yields i1*i2 - r1*r2 (the negated real part), so alpha_r is negated instead to fix the sign */\r
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1\r
+ /* the imaginary part is likewise negated, so alpha_i is negated instead to fix the sign */\r
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#endif\r
+.endm \r
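+/* example: with VSINR_OUT1 = {ar*br}, VSINR = {ai*bi}, VSINI_OUT2 = {ar*bi} and VSINI = {ai*br},\r
+   the NN/NT/TN/TT branch computes real = ar*br - ai*bi and imag = ar*bi + ai*br, i.e. the\r
+   standard complex product; the other branches flip signs for the conjugated variants */\r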
+\r
+/* VSOUT1 = {i0,i1}*{alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */\r
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2\r
+#ifndef TRMMKERNEL \r
+ xvmsubadp \VSOUT1,\VSINII, alpha_i\r
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i\r
+#else \r
+ xvmuldp \VSOUT1,\VSINII, alpha_i \r
+ xvmuldp \VSOUT2,\VSINRR, alpha_i\r
+#endif \r
+.endm\r
+\r
+/* VSOUT1 = {r0,r1}*{alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1}*{alpha_r,alpha_r} */\r
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 \r
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r\r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r\r
+.endm\r
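+/* taken together (non-TRMM path) PART1 and PART2 leave VSOUT1 = C_r + r*alpha_r - i*alpha_i and\r
+   VSOUT2 = C_i + i*alpha_r + r*alpha_i, i.e. C += alpha*result, assuming xvmsubadp computes\r
+   T = A*B - T and xvmaddadp computes T = A*B + T */\r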
+\r
+/* unpack the two {r,r} and {i,i} pairs back into {r,i} {r,i} for the store (big-endian order because of stxv) */ \r
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 \r
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1\r
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1\r
+.endm\r
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2\r
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)\r
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)\r
+.endm\r
+\r
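+/* SAVE8: combine 16 accumulators into 8 double-complex results, scale by alpha, add the\r
+   existing C values (skipped for TRMMKERNEL) and store 128 bytes at BASE_REG+LOFFSET */\r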
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET\r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3\r
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET\r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5\r
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)\r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7\r
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)\r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 \r
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)\r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11\r
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5\r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 \r
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 \r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2\r
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15\r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4\r
+ MULT_APLHA_PART1 vs6,vs8,vs16,vs17\r
+ MULT_APLHA_PART2 vs2,vs4,vs14,vs15 \r
+ AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13\r
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17\r
+ AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 \r
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9\r
+ MULT_APLHA_PART1 vs10,vs12, vs24,vs25\r
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5 \r
+ MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27\r
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9\r
+ MULT_APLHA_PART2 vs10,vs12,vs24,vs25\r
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 \r
+ MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27\r
+ UNPACK_FOR_STORE vs24,vs25,vs10,vs12\r
+ UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3\r
+ STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12\r
+ STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3\r
+.endm\r
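+/* SAVE8 reduces 16 accumulators (8 complex results of one column) into {real,imag} form,\r
+   applies alpha, adds the existing C values unless TRMMKERNEL is defined, and stores 128 bytes\r
+   starting at BASE_REG+LOFFSET; the C loads and stores are interleaved with the arithmetic */\r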
+\r
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET\r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3\r
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET\r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5\r
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)\r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7\r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 \r
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 \r
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 \r
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15\r
+ MULT_APLHA_PART1 vs6,vs8, vs16,vs17\r
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15 \r
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17\r
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9\r
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5\r
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9\r
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5\r
+.endm\r
+\r
+\r
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET\r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3\r
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET\r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 \r
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 \r
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15 \r
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15 \r
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9 \r
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 \r
+.endm\r
+\r
+\r
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET\r
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3\r
+#ifndef TRMMKERNEL \r
+ lxv vs18, (\LOFFSET)(\BASE_REG) \r
+ xxmrgld vs14,vs18,vs18\r
+ xxmrghd vs15,vs18,vs18 \r
+#endif \r
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 \r
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 \r
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15 \r
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15 \r
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9 \r
+ xxmrghd vs7,vs15,vs14 \r
+ stxv vs7, (\LOFFSET)(\BASE_REG) \r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=2 and M=8\r
+**********************************************************************************************/\r
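+/* In these reworked N=2 macros B is read as packed {real,imag} vectors (e.g. vs16/vs18) plus an\r
+   xxswapd copy (vs17/vs19), 32 bytes per k-iteration for both columns, instead of the four\r
+   16-byte real/imag vectors used before; each accumulator pair then holds {r*r,i*i} and\r
+   {r*i,i*r} per element and is reduced by the SAVE* macros above */\r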
\r
.macro Zero2x8\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
- xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35\r
- xxlxor vs36, vs36, vs36\r
- xxlxor vs37, vs37, vs37\r
- xxlxor vs38, vs38, vs38\r
- xxlxor vs39, vs39, vs39\r
- xxlxor vs40, vs40, vs40\r
- xxlxor vs41, vs41, vs41\r
- xxlxor vs42, vs42, vs42\r
- xxlxor vs43, vs43, vs43\r
- xxlxor vs44, vs44, vs44\r
- xxlxor vs45, vs45, vs45\r
- xxlxor vs46, vs46, vs46\r
- xxlxor vs47, vs47, vs47\r
- xxlxor vs48, vs48, vs48\r
- xxlxor vs49, vs49, vs49\r
- xxlxor vs50, vs50, vs50\r
- xxlxor vs51, vs51, vs51 \r
- xxlxor vs52, vs52, vs52\r
- xxlxor vs53, vs53, vs53\r
- xxlxor vs54, vs54, vs54\r
- xxlxor vs55, vs55, vs55 \r
- xxlxor vs56, vs56, vs56\r
- xxlxor vs57, vs57, vs57\r
- xxlxor vs58, vs58, vs58\r
- xxlxor vs59, vs59, vs59 \r
- xxlxor vs60, vs60, vs60\r
- xxlxor vs61, vs61, vs61\r
- xxlxor vs62, vs62, vs62\r
- xxlxor vs63, vs63, vs63 \r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47\r
+ xxlxor vs48, vs48, vs48\r
+ xxlxor vs49, vs49, vs49\r
+ xxlxor vs50, vs50, vs50\r
+ xxlxor vs51, vs51, vs51\r
+ xxlxor vs52, vs52, vs52\r
+ xxlxor vs53, vs53, vs53\r
+ xxlxor vs54, vs54, vs54\r
+ xxlxor vs55, vs55, vs55\r
+ xxlxor vs56, vs56, vs56\r
+ xxlxor vs57, vs57, vs57\r
+ xxlxor vs58, vs58, vs58\r
+ xxlxor vs59, vs59, vs59\r
+ xxlxor vs60, vs60, vs60\r
+ xxlxor vs61, vs61, vs61\r
+ xxlxor vs62, vs62, vs62\r
+ xxlxor vs63, vs63, vs63\r
.endm\r
\r
.macro LOAD2x8 Zero\r
\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B\r
- lxv vs18, 32(BO) // load real part from B\r
- lxv vs19, 48(BO) // load imag part from B\r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ lxv vs18, 16(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
\r
- lxv vs4, 64(AO) // load real,imag from A\r
- lxv vs5, 80(AO) // load real,imag from A\r
- lxv vs6, 96(AO) // load real,imag from A\r
- lxv vs7, 112(AO) // load real,imag from A\r
+ lxv vs4, 64(AO) // load real,imag from A\r
+ lxv vs5, 80(AO) // load real,imag from A\r
+ lxv vs6, 96(AO) // load real,imag from A\r
+ lxv vs7, 112(AO) // load real,imag from A\r
\r
.if \Zero==1\r
- Zero2x8 \r
+ Zero2x8\r
.endif\r
\r
.endm\r
\r
.macro END2x8_NORMAL\r
- END2x8 AO,BO,128,64\r
+ END2x8 AO,BO,128,32\r
.endm\r
\r
-.macro END2x8 AREG, BREG, OffsetA, OffsetB\r
+.macro END2x8 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
.endif\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
-\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
-\r
- xvmaddadp vs48, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag\r
- xvmaddadp vs50, vs1, vs18 // real*real, imag*real\r
- xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag\r
- xvmaddadp vs52, vs2, vs18 // real*real, imag*real\r
- xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag\r
- xvmaddadp vs54, vs3, vs18 // real*real, imag*real\r
- xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag\r
- xvmaddadp vs56, vs4, vs18 // real*real, imag*real\r
- xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag\r
- xvmaddadp vs58, vs5, vs18 // real*real, imag*real\r
- xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag\r
- xvmaddadp vs60, vs6, vs18 // real*real, imag*real\r
- xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag\r
- xvmaddadp vs62, vs7, vs18 // real*real, imag*real\r
- xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag\r
-\r
-.endm\r
-\r
-.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
-.endm\r
-\r
-.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
-.endm\r
-\r
-.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
-\r
- lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A \r
-\r
- lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
-\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
-\r
- xvmaddadp vs48, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag\r
- xvmaddadp vs50, vs1, vs18 // real*real, imag*real\r
- xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag\r
- xvmaddadp vs52, vs2, vs18 // real*real, imag*real\r
- xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag\r
- xvmaddadp vs54, vs3, vs18 // real*real, imag*real\r
- xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag\r
- xvmaddadp vs56, vs4, vs18 // real*real, imag*real\r
- xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag\r
- xvmaddadp vs58, vs5, vs18 // real*real, imag*real\r
- xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag\r
- xvmaddadp vs60, vs6, vs18 // real*real, imag*real\r
- xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag\r
- xvmaddadp vs62, vs7, vs18 // real*real, imag*real\r
- xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag\r
-\r
-.if \Complete==0\r
- lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
- lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
.endif\r
\r
-.if \IsLast==1 \r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP16(\Index,256)\r
- addi \BREG, \BREG, DISP8(\Index,128)\r
-.endif\r
-.endif \r
-\r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real\r
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag\r
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real\r
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag\r
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real\r
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag\r
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real\r
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag\r
-\r
- xvmaddadp vs48, vs8, vs22 // real*real, imag*real\r
- xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag\r
- xvmaddadp vs50, vs9, vs22 // real*real, imag*real\r
- xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag\r
- xvmaddadp vs52, vs10, vs22 // real*real, imag*real\r
- xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag\r
- xvmaddadp vs54, vs11, vs22 // real*real, imag*real\r
- xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag\r
- xvmaddadp vs56, vs12, vs22 // real*real, imag*real\r
- xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag\r
- xvmaddadp vs58, vs13, vs22 // real*real, imag*real\r
- xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag\r
- xvmaddadp vs60, vs14, vs22 // real*real, imag*real\r
- xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag\r
- xvmaddadp vs62, vs15, vs22 // real*real, imag*real\r
- xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag\r
-\r
-.endm\r
-\r
-.macro KERNEL2x8 \r
- LOAD2x8 0\r
- END2x8 AO, BO, 128,64 \r
-.endm\r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs48, vs0, vs18\r
\r
-.macro SAVE2x8\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs50, vs1, vs18\r
\r
- mr T1, CO\r
- addi T2, T1, 64\r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs52, vs2, vs18\r
\r
-#ifndef TRMMKERNEL\r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs54, vs3, vs18\r
\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
- lxv vs18, 32(T1)\r
- lxv vs19, 48(T1)\r
- lxv vs20, 0(T2)\r
- lxv vs21, 16(T2)\r
- lxv vs22, 32(T2)\r
- lxv vs23, 48(T2)\r
+ xvmaddadp vs40, vs4, vs16\r
+ xvmaddadp vs56, vs4, vs18\r
\r
-#endif\r
+ xvmaddadp vs42, vs5, vs16\r
+ xvmaddadp vs58, vs5, vs18\r
\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
- AGGREGATE_INTO_COMPLEX vs40,vs41,vs12\r
- AGGREGATE_INTO_COMPLEX vs42,vs43,vs13\r
- AGGREGATE_INTO_COMPLEX vs44,vs45,vs14\r
- AGGREGATE_INTO_COMPLEX vs46,vs47,vs15\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
- xvadddp vs10, vs10, vs18\r
- xvadddp vs11, vs11, vs19\r
- xvadddp vs12, vs12, vs20\r
- xvadddp vs13, vs13, vs21\r
- xvadddp vs14, vs14, vs22\r
- xvadddp vs15, vs15, vs23\r
+ xvmaddadp vs44, vs6, vs16\r
+ xvmaddadp vs60, vs6, vs18\r
\r
-#endif\r
+ xvmaddadp vs46, vs7, vs16\r
+ xvmaddadp vs62, vs7, vs18\r
\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
- stxv vs10, 32(T1)\r
- stxv vs11, 48(T1)\r
- stxv vs12, 0(T2)\r
- stxv vs13, 16(T2)\r
- stxv vs14, 32(T2)\r
- stxv vs15, 48(T2)\r
-\r
- add T1, T1, LDC\r
- add T2, T2, LDC\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
- lxv vs18, 32(T1)\r
- lxv vs19, 48(T1)\r
- lxv vs20, 0(T2)\r
- lxv vs21, 16(T2)\r
- lxv vs22, 32(T2)\r
- lxv vs23, 48(T2)\r
\r
-#endif\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs49, vs0, vs19\r
\r
- AGGREGATE_INTO_COMPLEX vs48,vs49,vs8\r
- AGGREGATE_INTO_COMPLEX vs50,vs51,vs9\r
- AGGREGATE_INTO_COMPLEX vs52,vs53,vs10\r
- AGGREGATE_INTO_COMPLEX vs54,vs55,vs11\r
- AGGREGATE_INTO_COMPLEX vs56,vs57,vs12\r
- AGGREGATE_INTO_COMPLEX vs58,vs59,vs13\r
- AGGREGATE_INTO_COMPLEX vs60,vs61,vs14\r
- AGGREGATE_INTO_COMPLEX vs62,vs63,vs15\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
- xvadddp vs10, vs10, vs18\r
- xvadddp vs11, vs11, vs19\r
- xvadddp vs12, vs12, vs20\r
- xvadddp vs13, vs13, vs21\r
- xvadddp vs14, vs14, vs22\r
- xvadddp vs15, vs15, vs23\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs51, vs1, vs19\r
\r
-#endif\r
+ xvmaddadp vs37, vs2, vs17\r
+ xvmaddadp vs53, vs2, vs19\r
\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
- stxv vs10, 32(T1)\r
- stxv vs11, 48(T1)\r
- stxv vs12, 0(T2)\r
- stxv vs13, 16(T2)\r
- stxv vs14, 32(T2)\r
- stxv vs15, 48(T2)\r
- \r
- addi CO, CO, 128\r
+ xvmaddadp vs39, vs3, vs17\r
+ xvmaddadp vs55, vs3, vs19\r
+\r
+ xvmaddadp vs41, vs4, vs17\r
+ xvmaddadp vs57, vs4, vs19\r
+\r
+ xvmaddadp vs43, vs5, vs17\r
+ xvmaddadp vs59, vs5, vs19\r
+\r
+ xvmaddadp vs45, vs6, vs17\r
+ xvmaddadp vs61, vs6, vs19\r
+\r
+ xvmaddadp vs47, vs7, vs17\r
+ xvmaddadp vs63, vs7, vs19\r
\r
.endm\r
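+/* Per element, with vs0 = {a_r,a_i}, vs16 = {b_r,b_i} and vs17 = xxswapd(vs16) = {b_i,b_r},\r
+   vs32 accumulates {a_r*b_r, a_i*b_i} and vs33 accumulates {a_r*b_i, a_i*b_r}; SAVE8 later\r
+   combines these into real/imag according to the conjugation case */\r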
\r
-/**********************************************************************************************\r
-* Macros for N=2 and M=4\r
-**********************************************************************************************/\r
+.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
\r
-.macro Zero2x4\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
- xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35\r
- xxlxor vs36, vs36, vs36\r
- xxlxor vs37, vs37, vs37\r
- xxlxor vs38, vs38, vs38\r
- xxlxor vs39, vs39, vs39\r
- xxlxor vs40, vs40, vs40\r
- xxlxor vs41, vs41, vs41\r
- xxlxor vs42, vs42, vs42\r
- xxlxor vs43, vs43, vs43\r
- xxlxor vs44, vs44, vs44\r
- xxlxor vs45, vs45, vs45\r
- xxlxor vs46, vs46, vs46\r
- xxlxor vs47, vs47, vs47 \r
+.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
.endm\r
\r
-.macro LOAD2x4 Zero\r
\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B\r
- lxv vs18, 32(BO) // load real part from B\r
- lxv vs19, 48(BO) // load imag part from B\r
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
- \r
-.if \Zero==1\r
- Zero2x4 \r
-.endif\r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs48, vs0, vs18\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs49, vs0, vs19\r
\r
-.endm\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22\r
\r
-.macro END2x4_NORMAL\r
- END2x4 AO,BO,64,64\r
-.endm\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs50, vs1, vs18\r
+\r
+ lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs51, vs1, vs19\r
\r
-.macro END2x4 AREG, BREG, OffsetA, OffsetB\r
+ lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
\r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs52, vs2, vs18\r
+\r
+ lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ xvmaddadp vs37, vs2, vs17\r
+ xvmaddadp vs53, vs2, vs19\r
+\r
+ lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+.if \IsLast==1\r
+.if \Complete==1 \r
+ addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
+.endif\r
.endif\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
-\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
-\r
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag\r
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real\r
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag\r
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real\r
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag\r
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real\r
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag\r
-\r
-.endm\r
-\r
-.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
-.endm\r
-\r
-.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
-.endm\r
-\r
-.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
-\r
- lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
-\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
-\r
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag\r
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real\r
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag\r
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real\r
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag\r
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real\r
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag\r
+\r
+\r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs54, vs3, vs18\r
\r
.if \Complete==0\r
- lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A \r
-\r
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs0, DISP16(\Index,128 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
.endif\r
\r
-.if \IsLast==1 \r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)\r
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP8(\Index,128)\r
- addi \BREG, \BREG, DISP8(\Index,128)\r
-.endif\r
-.endif \r
-\r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
- \r
- xvmaddadp vs40, vs8, vs22 // real*real, imag*real\r
- xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag\r
- xvmaddadp vs42, vs9, vs22 // real*real, imag*real\r
- xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag\r
- xvmaddadp vs44, vs10, vs22 // real*real, imag*real\r
- xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag\r
- xvmaddadp vs46, vs11, vs22 // real*real, imag*real\r
- xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag\r
\r
-.endm\r
+ xvmaddadp vs39, vs3, vs17\r
+ xvmaddadp vs55, vs3, vs19\r
\r
-.macro KERNEL2x4 \r
- LOAD2x4 0\r
- END2x4 AO, BO, 64,64 \r
-.endm\r
+.if \Complete==0\r
+ lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
+ xvmaddadp vs40, vs4, vs16\r
+ xvmaddadp vs56, vs4, vs18\r
\r
-.macro SAVE2x4\r
+ xvmaddadp vs41, vs4, vs17\r
+ xvmaddadp vs57, vs4, vs19\r
\r
- mr T1, CO\r
+ xvmaddadp vs42, vs5, vs16\r
+ xvmaddadp vs58, vs5, vs18\r
+ xvmaddadp vs43, vs5, vs17\r
+ xvmaddadp vs59, vs5, vs19\r
\r
-#ifndef TRMMKERNEL\r
+.if \Complete==0\r
+ lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
- lxv vs18, 32(T1)\r
- lxv vs19, 48(T1)\r
+ xvmaddadp vs44, vs6, vs16\r
+ xvmaddadp vs60, vs6, vs18\r
+ xvmaddadp vs45, vs6, vs17\r
+ xvmaddadp vs61, vs6, vs19\r
\r
-#endif\r
+ xvmaddadp vs46, vs7, vs16\r
+ xvmaddadp vs62, vs7, vs18\r
+ xvmaddadp vs47, vs7, vs17\r
+ xvmaddadp vs63, vs7, vs19\r
+\r
+.if \Complete==0\r
+ lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs48, vs8, vs22\r
+.if \Complete==0\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.endif\r
+.if \Complete==0\r
+.if \IsLast==1 \r
+ addi \AREG, \AREG, DISP16(\Index,256)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
\r
-#ifndef TRMMKERNEL\r
+.endif\r
+ xvmaddadp vs33, vs8, vs21\r
+ xvmaddadp vs49, vs8, vs23\r
\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
- xvadddp vs10, vs10, vs18\r
- xvadddp vs11, vs11, vs19\r
+.if \Complete==0\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+.endif\r
\r
-#endif\r
+ xvmaddadp vs34, vs9, vs20\r
+ xvmaddadp vs50, vs9, vs22\r
+ xvmaddadp vs35, vs9, vs21\r
+ xvmaddadp vs51, vs9, vs23\r
\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
- stxv vs10, 32(T1)\r
- stxv vs11, 48(T1)\r
+ xvmaddadp vs36, vs10, vs20\r
+ xvmaddadp vs52, vs10, vs22\r
+ xvmaddadp vs37, vs10, vs21\r
+ xvmaddadp vs53, vs10, vs23\r
\r
- add T1, T1, LDC\r
+ xvmaddadp vs38, vs11, vs20\r
+ xvmaddadp vs54, vs11, vs22\r
+ xvmaddadp vs39, vs11, vs21\r
+ xvmaddadp vs55, vs11, vs23\r
\r
-#ifndef TRMMKERNEL\r
+ xvmaddadp vs40, vs12, vs20\r
+ xvmaddadp vs56, vs12, vs22\r
+ xvmaddadp vs41, vs12, vs21\r
+ xvmaddadp vs57, vs12, vs23\r
\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
- lxv vs18, 32(T1)\r
- lxv vs19, 48(T1)\r
+ xvmaddadp vs42, vs13, vs20\r
+ xvmaddadp vs58, vs13, vs22\r
+ xvmaddadp vs43, vs13, vs21\r
+ xvmaddadp vs59, vs13, vs23\r
\r
-#endif\r
+ xvmaddadp vs44, vs14, vs20\r
+ xvmaddadp vs60, vs14, vs22\r
+ xvmaddadp vs45, vs14, vs21\r
+ xvmaddadp vs61, vs14, vs23\r
\r
- AGGREGATE_INTO_COMPLEX vs40,vs41,vs8\r
- AGGREGATE_INTO_COMPLEX vs42,vs43,vs9\r
- AGGREGATE_INTO_COMPLEX vs44,vs45,vs10\r
- AGGREGATE_INTO_COMPLEX vs46,vs47,vs11\r
+ xvmaddadp vs46, vs15, vs20\r
+ xvmaddadp vs62, vs15, vs22\r
+ xvmaddadp vs47, vs15, vs21\r
+ xvmaddadp vs63, vs15, vs23\r
\r
-#ifndef TRMMKERNEL\r
+.endm\r
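+/* KERNEL2x8_2 unrolls two k-iterations: operands for the second iteration (vs8-vs15, vs20-vs23)\r
+   and, when Complete==0, for the next call (vs0-vs7, vs16-vs19) are loaded in between the FMAs\r
+   on previously loaded data; Complete==1 (the _E variant) skips those trailing loads for the tail */\r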
\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
- xvadddp vs10, vs10, vs18\r
- xvadddp vs11, vs11, vs19\r
+.macro KERNEL2x8\r
+ LOAD2x8 0\r
+ END2x8 AO, BO, 128,32\r
+.endm\r
\r
-#endif\r
+.macro SAVE2x8\r
\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
- stxv vs10, 32(T1)\r
- stxv vs11, 48(T1)\r
- \r
- addi CO, CO, 64\r
+ add T1, CO, LDC\r
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0\r
+ SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 \r
+ addi CO, CO, 128\r
\r
.endm\r
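+/* SAVE2x8 writes the two columns of the 2x8 block: the first at CO, the second at T1 = CO + LDC,\r
+   then advances CO by 128 bytes (8 complex doubles) */\r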
\r
/**********************************************************************************************\r
-* Macros for N=2 and M=2\r
+* Macros for N=2 and M=4\r
**********************************************************************************************/\r
\r
-.macro Zero2x2\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
- xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35\r
- xxlxor vs36, vs36, vs36\r
- xxlxor vs37, vs37, vs37\r
- xxlxor vs38, vs38, vs38\r
- xxlxor vs39, vs39, vs39 \r
+.macro Zero2x4\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47\r
.endm\r
\r
-.macro LOAD2x2 Zero\r
+.macro LOAD2x4 Zero\r
\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B\r
- lxv vs18, 32(BO) // load real part from B\r
- lxv vs19, 48(BO) // load imag part from B\r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ lxv vs18, 16(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A \r
- \r
.if \Zero==1\r
- Zero2x2 \r
+ Zero2x4\r
.endif\r
\r
.endm\r
\r
-.macro END2x2_NORMAL\r
- END2x2 AO,BO,32,64\r
+.macro END2x4_NORMAL\r
+ END2x4 AO,BO,64,32\r
.endm\r
\r
-.macro END2x2 AREG, BREG, OffsetA, OffsetB\r
+.macro END2x4 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
.endif\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs40, vs0, vs18\r
+ xvmaddadp vs41, vs0, vs19\r
\r
- xvmaddadp vs36, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag\r
- xvmaddadp vs38, vs1, vs18 // real*real, imag*real\r
- xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag \r
- \r
-.endm\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs42, vs1, vs18\r
+ xvmaddadp vs43, vs1, vs19\r
+ \r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs37, vs2, vs17\r
+ xvmaddadp vs44, vs2, vs18\r
+ xvmaddadp vs45, vs2, vs19\r
\r
-.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
-.endm\r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs39, vs3, vs17\r
+ xvmaddadp vs46, vs3, vs18\r
+ xvmaddadp vs47, vs3, vs19\r
\r
-.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
.endm\r
\r
-.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
-\r
- lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
\r
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
+.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag \r
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- xvmaddadp vs36, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag\r
- xvmaddadp vs38, vs1, vs18 // real*real, imag*real\r
- xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag \r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
+ \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22 \r
+ lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ xvmaddadp vs40, vs0, vs18\r
+ xvmaddadp vs41, vs0, vs19\r
+ lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) \r
+ addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) \r
+.endif\r
+.endif\r
\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs42, vs1, vs18\r
+ xvmaddadp vs43, vs1, vs19\r
+ \r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs37, vs2, vs17\r
.if \Complete==0\r
- lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A \r
+ lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+ xvmaddadp vs44, vs2, vs18\r
+ xvmaddadp vs45, vs2, vs19\r
+ \r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs39, vs3, vs17\r
+ xvmaddadp vs46, vs3, vs18\r
+ xvmaddadp vs47, vs3, vs19\r
+\r
+\r
+.if \Complete==0 \r
+ lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
\r
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
.endif\r
-\r
-.if \IsLast==1 \r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP4(\Index,64)\r
- addi \BREG, \BREG, DISP8(\Index,128)\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
+.if \Complete==0\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.if \IsLast==1 \r
+ addi \AREG, \AREG, DISP8(\Index,128)\r
+ addi \BREG, \BREG, DISP4(\Index,64) \r
+.endif \r
.endif\r
-.endif \r
\r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag \r
+.if \Complete==0\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+.endif\r
\r
- xvmaddadp vs36, vs8, vs22 // real*real, imag*real\r
- xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag\r
- xvmaddadp vs38, vs9, vs22 // real*real, imag*real\r
- xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag \r
- \r
+ xvmaddadp vs40, vs8, vs22\r
+ xvmaddadp vs41, vs8, vs23\r
+\r
+ xvmaddadp vs34, vs9, vs20\r
+ xvmaddadp vs35, vs9, vs21\r
+ xvmaddadp vs42, vs9, vs22\r
+ xvmaddadp vs43, vs9, vs23\r
+\r
+ xvmaddadp vs36, vs10, vs20\r
+ xvmaddadp vs37, vs10, vs21\r
+ xvmaddadp vs44, vs10, vs22\r
+ xvmaddadp vs45, vs10, vs23\r
+\r
+ xvmaddadp vs38, vs11, vs20\r
+ xvmaddadp vs39, vs11, vs21\r
+ xvmaddadp vs46, vs11, vs22\r
+ xvmaddadp vs47, vs11, vs23\r
+\r
.endm\r
\r
-.macro KERNEL2x2 \r
- LOAD2x2 0\r
- END2x2 AO, BO, 32,64 \r
+.macro KERNEL2x4\r
+ LOAD2x4 0\r
+ END2x4 AO, BO, 64,32\r
.endm\r
\r
-.macro SAVE2x2\r
+.macro SAVE2x4 \r
+ add T1, CO, LDC\r
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0\r
+ SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 \r
+ addi CO, CO, 64\r
\r
- mr T1, CO\r
+.endm\r
\r
-#ifndef TRMMKERNEL\r
+/**********************************************************************************************\r
+* Macros for N=2 and M=2\r
+**********************************************************************************************/\r
\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
+.macro Zero2x2\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+.endm\r
\r
-#endif\r
+.macro LOAD2x2 Zero\r
\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 \r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ lxv vs18, 16(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
\r
-#ifndef TRMMKERNEL\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
\r
-#endif\r
+.if \Zero==1\r
+ Zero2x2\r
+.endif \r
+.endm\r
\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
+.macro END2x2_NORMAL\r
+ END2x2 AO,BO,32,32\r
+.endm\r
\r
- add T1, T1, LDC\r
+.macro END2x2 AREG, BREG, OffsetA, OffsetB\r
\r
-#ifndef TRMMKERNEL\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs36, vs0, vs18\r
+ xvmaddadp vs37, vs0, vs19\r
\r
-#endif\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17 \r
+ xvmaddadp vs38, vs1, vs18\r
+ xvmaddadp vs39, vs1, vs19\r
\r
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs8\r
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs9\r
+.endm\r
\r
-#ifndef TRMMKERNEL\r
+.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
+.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
\r
-#endif\r
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
- \r
- addi CO, CO, 32\r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22\r
+\r
+ lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) \r
+.endif\r
+.endif \r
+ xvmaddadp vs36, vs0, vs18\r
+ xvmaddadp vs37, vs0, vs19\r
+\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17 \r
+ xvmaddadp vs38, vs1, vs18\r
+ xvmaddadp vs39, vs1, vs19\r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
+.if \Complete==0\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.if \IsLast==1 \r
+ addi \AREG, \AREG, DISP4(\Index,64)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif \r
+.endif\r
\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
+\r
+.if \Complete==0\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+.endif \r
+ xvmaddadp vs36, vs8, vs22\r
+ xvmaddadp vs37, vs8, vs23\r
+\r
+ xvmaddadp vs34, vs9, vs20\r
+ xvmaddadp vs35, vs9, vs21 \r
+\r
+ xvmaddadp vs38, vs9, vs22\r
+ xvmaddadp vs39, vs9, vs23\r
+\r
+.endm\r
+\r
+.macro KERNEL2x2\r
+ LOAD2x2 0\r
+ END2x2 AO, BO, 32,32\r
+.endm\r
+\r
+.macro SAVE2x2 \r
+ add T1, CO, LDC\r
+ SAVE2 vs32,vs33,vs34,vs35,CO,0\r
+ SAVE2 vs36,vs37,vs38,vs39,T1,0 \r
+ addi CO, CO, 32 \r
.endm\r
\r
/**********************************************************************************************\r
* Macros for N=2 and M=1\r
**********************************************************************************************/\r
\r
.macro Zero2x1\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
- xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35 \r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
.endm\r
\r
.macro LOAD2x1 Zero\r
- lxv vs0, 0(AO) // load real,imag from A \r
+ lxv vs0, 0(AO) // load real,imag from A\r
\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B\r
- lxv vs18, 32(BO) // load real part from B\r
- lxv vs19, 48(BO) // load imag part from B\r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ lxv vs18, 16(BO) // load real,imag from B\r
\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
.if \Zero==1\r
- Zero2x1 \r
-.endif\r
-\r
+ Zero2x1\r
+.endif \r
.endm\r
\r
.macro END2x1_NORMAL\r
- END2x1 AO,BO,16,64\r
+ END2x1 AO,BO,16,32\r
.endm\r
\r
-.macro END2x1 AREG, BREG, OffsetA, OffsetB\r
+.macro END2x1 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
.endif\r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+\r
+ xvmaddadp vs34, vs0, vs18\r
+ xvmaddadp vs35, vs0, vs19\r
\r
- xvmaddadp vs34, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag \r
- \r
.endm\r
\r
-.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
.endm\r
\r
-.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
.endm\r
\r
-.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
\r
- lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
\r
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) \r
+.endif\r
+.endif\r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
\r
- xvmaddadp vs34, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag \r
+ xvmaddadp vs34, vs0, vs18\r
+ xvmaddadp vs35, vs0, vs19\r
\r
.if \Complete==0\r
- lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A \r
+ lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
\r
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
.endif\r
-\r
-.if \IsLast==1 \r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP2(\Index,32)\r
- addi \BREG, \BREG, DISP8(\Index,128)\r
+.if \Complete==0\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.if \IsLast==1 \r
+ addi \AREG, \AREG, DISP2(\Index,32)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif \r
.endif\r
-.endif \r
-\r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag \r
\r
- xvmaddadp vs34, vs8, vs22 // real*real, imag*real\r
- xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag \r
- \r
+.if \Complete==0\r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+.endif\r
+\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
+\r
+ xvmaddadp vs34, vs8, vs22\r
+ xvmaddadp vs35, vs8, vs23\r
+\r
.endm\r
\r
-.macro KERNEL2x1 \r
+.macro KERNEL2x1\r
LOAD2x1 0\r
- END2x1 AO, BO, 16,64 \r
+ END2x1 AO, BO, 16,32\r
.endm\r
\r
.macro SAVE2x1\r
-\r
- mr T1, CO\r
-#ifndef TRMMKERNEL\r
- lxv vs16, 0(T1)\r
-#endif\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
-\r
-#ifndef TRMMKERNEL\r
- xvadddp vs8, vs8, vs16\r
-#endif\r
-\r
- stxv vs8, 0(T1)\r
-\r
- add T1, T1, LDC\r
-\r
-#ifndef TRMMKERNEL\r
- lxv vs16, 0(T1)\r
-#endif\r
-\r
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs8\r
-\r
-#ifndef TRMMKERNEL\r
- xvadddp vs8, vs8, vs16\r
-#endif\r
-\r
- stxv vs8, 0(T1)\r
-\r
- addi CO, CO, 16\r
-\r
+ add T1, CO, LDC\r
+ SAVE1 vs32,vs33,CO,0\r
+ SAVE1 vs34,vs35,T1,0 \r
+ addi CO, CO, 16 \r
.endm\r
\r
/**********************************************************************************************\r
* Macros for N=1 and M=8\r
**********************************************************************************************/\r
.macro Zero1x8\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
- xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35\r
- xxlxor vs36, vs36, vs36\r
- xxlxor vs37, vs37, vs37\r
- xxlxor vs38, vs38, vs38\r
- xxlxor vs39, vs39, vs39\r
- xxlxor vs40, vs40, vs40\r
- xxlxor vs41, vs41, vs41\r
- xxlxor vs42, vs42, vs42\r
- xxlxor vs43, vs43, vs43\r
- xxlxor vs44, vs44, vs44\r
- xxlxor vs45, vs45, vs45\r
- xxlxor vs46, vs46, vs46\r
- xxlxor vs47, vs47, vs47 \r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47\r
.endm\r
\r
.macro LOAD1x8 Zero\r
\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B \r
-\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
\r
- lxv vs4, 64(AO) // load real,imag from A\r
- lxv vs5, 80(AO) // load real,imag from A\r
- lxv vs6, 96(AO) // load real,imag from A\r
- lxv vs7, 112(AO) // load real,imag from A\r
+ lxv vs4, 64(AO) // load real,imag from A\r
+ lxv vs5, 80(AO) // load real,imag from A\r
+ lxv vs6, 96(AO) // load real,imag from A\r
+ lxv vs7, 112(AO) // load real,imag from A\r
\r
.if \Zero==1\r
- Zero1x8 \r
+ Zero1x8\r
.endif\r
\r
.endm\r
\r
.macro END1x8_NORMAL\r
- END1x8 AO,BO,128,32\r
+ END1x8 AO,BO,128,16\r
.endm\r
\r
-.macro END1x8 AREG, BREG, OffsetA, OffsetB\r
+.macro END1x8 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
.endif\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
-\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
-\r
-.endm\r
-\r
-.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
-.endm\r
-\r
-.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
-.endm\r
-\r
-.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
-\r
- lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A \r
-\r
- lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
-\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
-\r
-.if \Complete==0\r
- lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
- lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B \r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
.endif\r
\r
-.if \IsLast==1 \r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP16(\Index,256)\r
- addi \BREG, \BREG, DISP4(\Index,64)\r
-.endif\r
-.endif \r
-\r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real\r
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag\r
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real\r
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag\r
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real\r
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag\r
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real\r
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag\r
-\r
-.endm\r
-\r
-.macro KERNEL1x8 \r
- LOAD1x8 0\r
- END1x8 AO, BO, 128,32 \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs37, vs2, vs17\r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs39, vs3, vs17\r
+ xvmaddadp vs40, vs4, vs16\r
+ xvmaddadp vs41, vs4, vs17\r
+ xvmaddadp vs42, vs5, vs16\r
+ xvmaddadp vs43, vs5, vs17\r
+ xvmaddadp vs44, vs6, vs16\r
+ xvmaddadp vs45, vs6, vs17\r
+ xvmaddadp vs46, vs7, vs16\r
+ xvmaddadp vs47, vs7, vs17\r
+\r
.endm\r
\r
-.macro SAVE1x8\r
+.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
\r
- mr T1, CO\r
- addi T2, T1, 64\r
+.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
\r
-#ifndef TRMMKERNEL\r
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
- lxv vs18, 32(T1)\r
- lxv vs19, 48(T1)\r
- lxv vs20, 0(T2)\r
- lxv vs21, 16(T2)\r
- lxv vs22, 32(T2)\r
- lxv vs23, 48(T2)\r
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs21, vs20\r
\r
-#endif\r
\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
- AGGREGATE_INTO_COMPLEX vs40,vs41,vs12\r
- AGGREGATE_INTO_COMPLEX vs42,vs43,vs13\r
- AGGREGATE_INTO_COMPLEX vs44,vs45,vs14\r
- AGGREGATE_INTO_COMPLEX vs46,vs47,vs15\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
- xvadddp vs10, vs10, vs18\r
- xvadddp vs11, vs11, vs19\r
- xvadddp vs12, vs12, vs20\r
- xvadddp vs13, vs13, vs21\r
- xvadddp vs14, vs14, vs22\r
- xvadddp vs15, vs15, vs23\r
+ lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17 \r
+ lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17\r
+ lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs37, vs2, vs17\r
+ lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
\r
-#endif\r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs39, vs3, vs17\r
+.if \Complete==0\r
+ lxv vs0, DISP16(\Index,128 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+ xvmaddadp vs40, vs4, vs16\r
+ xvmaddadp vs41, vs4, vs17\r
+.if \Complete==0 \r
+ lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+ xvmaddadp vs42, vs5, vs16\r
+ xvmaddadp vs43, vs5, vs17\r
+ xvmaddadp vs44, vs6, vs16\r
+ xvmaddadp vs45, vs6, vs17\r
+.if \Complete==0\r
+ lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+ xvmaddadp vs46, vs7, vs16\r
+ xvmaddadp vs47, vs7, vs17\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
+.if \Complete==0 \r
+ lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A \r
+.endif\r
+ xvmaddadp vs34, vs9, vs20\r
+ xvmaddadp vs35, vs9, vs21\r
+.if \Complete==0\r
+ lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs17,vs16\r
+.endif\r
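+// On the last unrolled call, advance the A/B pointers past everything consumed.\r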
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP16(\Index,256)\r
+ addi \BREG, \BREG, DISP2(\Index,32)\r
+.endif\r
+.endif\r
+ xvmaddadp vs36, vs10, vs20\r
+ xvmaddadp vs37, vs10, vs21\r
+\r
+ xvmaddadp vs38, vs11, vs20\r
+ xvmaddadp vs39, vs11, vs21\r
+\r
+ xvmaddadp vs40, vs12, vs20\r
+ xvmaddadp vs41, vs12, vs21\r
+ xvmaddadp vs42, vs13, vs20\r
+ xvmaddadp vs43, vs13, vs21\r
+ xvmaddadp vs44, vs14, vs20\r
+ xvmaddadp vs45, vs14, vs21\r
+ xvmaddadp vs46, vs15, vs20\r
+ xvmaddadp vs47, vs15, vs21\r
+\r
+.endm\r
\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
- stxv vs10, 32(T1)\r
- stxv vs11, 48(T1)\r
- stxv vs12, 0(T2)\r
- stxv vs13, 16(T2)\r
- stxv vs14, 32(T2)\r
- stxv vs15, 48(T2)\r
+.macro KERNEL1x8\r
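+// Single non-unrolled iteration: LOAD1x8 then END1x8, which also advances AO/BO.\r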
+ LOAD1x8 0\r
+ END1x8 AO, BO, 128,16\r
+.endm\r
+\r
+.macro SAVE1x8\r
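+// SAVE8 (a helper defined elsewhere in this file) recombines the accumulator pairs\r
+// and stores the eight complex results at CO (8 * 16 = 128 bytes).\r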
\r
- addi CO, CO, 128\r
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 \r
+ addi CO, CO, 128\r
\r
.endm\r
\r
**********************************************************************************************/\r
\r
.macro Zero1x4\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
- xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35\r
- xxlxor vs36, vs36, vs36\r
- xxlxor vs37, vs37, vs37\r
- xxlxor vs38, vs38, vs38\r
- xxlxor vs39, vs39, vs39 \r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
.endm\r
\r
.macro LOAD1x4 Zero\r
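+// Load one complex element of B (plus its swapped copy) and four complex\r
+// elements of A; optionally clear the accumulators first.\r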
\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B \r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ xxswapd vs17,vs16\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
- \r
.if \Zero==1\r
- Zero1x4 \r
+ Zero1x4\r
.endif\r
\r
.endm\r
\r
.macro END1x4_NORMAL\r
- END1x4 AO,BO,64,32\r
+ END1x4 AO,BO,64,16\r
.endm\r
\r
-.macro END1x4 AREG, BREG, OffsetA, OffsetB\r
+.macro END1x4 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
.endif\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs37, vs2, vs17\r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs39, vs3, vs17\r
\r
.endm\r
\r
-.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
.endm\r
\r
-.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
.endm\r
\r
-.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
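+// Same two-iteration pipelined scheme as KERNEL1x8_2, with four complex elements of A per B element.\r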
\r
- lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs21,vs20\r
\r
-lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+ lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17 \r
+ lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
\r
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real\r
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag\r
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real\r
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag\r
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real\r
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag\r
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real\r
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16\r
+ xvmaddadp vs37, vs2, vs17\r
+ xvmaddadp vs38, vs3, vs16\r
+ xvmaddadp vs39, vs3, vs17\r
\r
-.if \Complete==0\r
- lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A \r
+ xvmaddadp vs40, vs0, vs18\r
+ xvmaddadp vs41, vs0, vs19\r
+ xvmaddadp vs42, vs1, vs18\r
+ xvmaddadp vs43, vs1, vs19\r
+ xvmaddadp vs44, vs2, vs18\r
+ xvmaddadp vs45, vs2, vs19\r
+ xvmaddadp vs46, vs3, vs18\r
+ xvmaddadp vs47, vs3, vs19\r
\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B \r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
.endif\r
-\r
-.if \IsLast==1 \r
+.if \Complete==0 \r
+ lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
+ \r
+.endif\r
+.if \Complete==0\r
+ lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs17,vs16\r
+.endif\r
+.if \IsLast==1\r
.if \Complete==1\r
- addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP8(\Index,128)\r
- addi \BREG, \BREG, DISP4(\Index,64)\r
+ addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP8(\Index,128)\r
+ addi \BREG, \BREG, DISP2(\Index,32)\r
.endif\r
-.endif \r
-\r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
- \r
- xvmaddadp vs40, vs8, vs22 // real*real, imag*real\r
- xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag\r
- xvmaddadp vs42, vs9, vs22 // real*real, imag*real\r
- xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag\r
- xvmaddadp vs44, vs10, vs22 // real*real, imag*real\r
- xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag\r
- xvmaddadp vs46, vs11, vs22 // real*real, imag*real\r
- xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag\r
+.endif\r
+\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
+ xvmaddadp vs34, vs9, vs20\r
+ xvmaddadp vs35, vs9, vs21\r
+ xvmaddadp vs36, vs10, vs20\r
+ xvmaddadp vs37, vs10, vs21\r
+ xvmaddadp vs38, vs11, vs20\r
+ xvmaddadp vs39, vs11, vs21\r
+\r
+ xvmaddadp vs40, vs8, vs22\r
+ xvmaddadp vs41, vs8, vs23\r
+ xvmaddadp vs42, vs9, vs22\r
+ xvmaddadp vs43, vs9, vs23\r
+ xvmaddadp vs44, vs10, vs22\r
+ xvmaddadp vs45, vs10, vs23\r
+ xvmaddadp vs46, vs11, vs22\r
+ xvmaddadp vs47, vs11, vs23\r
\r
.endm\r
\r
-.macro KERNEL1x4 \r
+.macro KERNEL1x4\r
LOAD1x4 0\r
- END1x4 AO, BO, 64,32 \r
+ END1x4 AO, BO, 64,16\r
.endm\r
\r
.macro SAVE1x4\r
-\r
- mr T1, CO\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
- lxv vs18, 32(T1)\r
- lxv vs19, 48(T1)\r
-\r
-#endif\r
-\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
- xvadddp vs10, vs10, vs18\r
- xvadddp vs11, vs11, vs19\r
-\r
-#endif\r
-\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
- stxv vs10, 32(T1)\r
- stxv vs11, 48(T1) \r
- \r
- addi CO, CO, 64\r
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0\r
+ addi CO, CO, 64\r
\r
.endm\r
\r
**********************************************************************************************/\r
\r
.macro Zero1x2\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
- xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35 \r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
.endm\r
\r
.macro LOAD1x2 Zero\r
\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B \r
-\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A \r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ xxswapd vs17,vs16\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
\r
.if \Zero==1\r
- Zero1x2 \r
+ Zero1x2\r
.endif\r
\r
.endm\r
\r
.macro END1x2_NORMAL\r
- END1x2 AO,BO,32,32\r
+ END1x2 AO,BO,32,16\r
.endm\r
\r
-.macro END1x2 AREG, BREG, OffsetA, OffsetB\r
+.macro END1x2 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
.endif\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
- \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17\r
+\r
.endm\r
\r
-.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
.endm\r
\r
-.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
.endm\r
\r
-.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
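+// Two pipelined k-iterations, two complex elements of A per B element.\r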
\r
- lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs21,vs20\r
\r
-lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag \r
-.if \Complete==0\r
- lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A \r
+ lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B \r
+ xvmaddadp vs34, vs1, vs16\r
+ xvmaddadp vs35, vs1, vs17\r
+.if \Complete==0\r
+ lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A \r
.endif\r
-\r
-.if \IsLast==1 \r
+.if \Complete==0\r
+ lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs17,vs16\r
+.endif\r
+.if \IsLast==1\r
.if \Complete==1\r
- addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP4(\Index,64)\r
- addi \BREG, \BREG, DISP4(\Index,64)\r
+ addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP4(\Index,64)\r
+ addi \BREG, \BREG, DISP2(\Index,32)\r
+.endif\r
.endif\r
-.endif \r
\r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
+ xvmaddadp vs34, vs9, vs20\r
+ xvmaddadp vs35, vs9, vs21\r
\r
.endm\r
\r
-.macro KERNEL1x2 \r
+.macro KERNEL1x2\r
LOAD1x2 0\r
- END1x2 AO, BO, 32,32 \r
+ END1x2 AO, BO, 32,16\r
.endm\r
\r
.macro SAVE1x2\r
-\r
- mr T1, CO\r
-\r
-#ifndef TRMMKERNEL\r
-\r
- lxv vs16, 0(T1)\r
- lxv vs17, 16(T1)\r
-\r
-#endif\r
-\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 \r
-\r
-#ifndef TRMMKERNEL\r
-\r
- xvadddp vs8, vs8, vs16\r
- xvadddp vs9, vs9, vs17\r
-\r
-#endif\r
-\r
- stxv vs8, 0(T1)\r
- stxv vs9, 16(T1)\r
-\r
-addi CO, CO, 32\r
-\r
+ SAVE2 vs32,vs33,vs34,vs35,CO,0\r
+ addi CO, CO, 32 \r
.endm\r
\r
/**********************************************************************************************\r
**********************************************************************************************/\r
\r
.macro Zero1x1\r
- xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33 \r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
.endm\r
\r
.macro LOAD1x1 Zero\r
- lxv vs0, 0(AO) // load real,imag from A \r
-\r
- lxv vs16, 0(BO) // load real part from B\r
- lxv vs17, 16(BO) // load imag part from B \r
+ lxv vs0, 0(AO) // load real,imag from A\r
\r
+ lxv vs16, 0(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
.if \Zero==1\r
- Zero1x1 \r
+ Zero1x1\r
.endif\r
-\r
+ \r
.endm\r
\r
.macro END1x1_NORMAL\r
- END1x1 AO,BO,16,32\r
+ END1x1 AO,BO,16,16\r
.endm\r
\r
-.macro END1x1 AREG, BREG, OffsetA, OffsetB\r
+.macro END1x1 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetA != 0 \r
- addi \AREG, \AREG, \OffsetA \r
-.endif \r
-.if \OffsetB != 0 \r
- addi \BREG, \BREG, \OffsetB \r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
.endif\r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
- \r
- \r
-.endm\r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
\r
-.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
-.endm\r
\r
-.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast \r
- KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
.endm\r
\r
-.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
\r
- lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
\r
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
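+// Two pipelined k-iterations of the scalar tail: one complex element of A times one element of B.\r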
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs21, vs20\r
\r
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
+ lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ \r
+ xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs33, vs0, vs17\r
\r
.if \Complete==0\r
- lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A \r
-\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
- lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B \r
+ lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
+.if \Complete==0\r
+ lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
+ xxswapd vs17, vs16 \r
.endif\r
\r
-\r
-.if \IsLast==1 \r
+.if \IsLast==1\r
.if \Complete==1\r
- addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
-.else \r
- addi \AREG, \AREG, DISP2(\Index,32)\r
- addi \BREG, \BREG, DISP4(\Index,64)\r
+ addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP2(\Index,32)\r
+ addi \BREG, \BREG, DISP2(\Index,32)\r
.endif\r
.endif\r
- \r
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag \r
- \r
- \r
-.endm\r
-\r
-.macro KERNEL1x1 \r
- LOAD1x1 0\r
- END1x1 AO, BO, 16,32 \r
-\r
-.endm \r
\r
-.macro SAVE1x1\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
\r
- mr T1, CO\r
-#ifndef TRMMKERNEL\r
- lxv vs16, 0(T1)\r
-#endif\r
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
-\r
-#ifndef TRMMKERNEL\r
- xvadddp vs8, vs8, vs16\r
-#endif\r
-\r
- stxv vs8, 0(T1)\r
-\r
-addi CO, CO, 16\r
-\r
-.endm\r
-\r
-\r
-.macro ZCOPYB_2\r
-\r
- lxv vs32, 0(BO)\r
- lxv vs33, 16(BO) \r
- addi BO, BO, 32\r
- xxspltd vs40, vs32, 1\r
- xxspltd vs41, vs32, 0 \r
- xxspltd vs42, vs33, 1\r
- xxspltd vs43, vs33, 0\r
-\r
- stxv vs40, 0(BBO)\r
- stxv vs41, 16(BBO)\r
- stxv vs42, 32(BBO)\r
- stxv vs43, 48(BBO)\r
- addi BBO, BBO, 64\r
\r
.endm\r
\r
-.macro ZCOPYB_1\r
-\r
- lxv vs32, 0(BO) \r
- addi BO, BO, 16\r
- xxspltd vs40, vs32, 1\r
- xxspltd vs41, vs32, 0 \r
- stxv vs40, 0(BBO)\r
- stxv vs41, 16(BBO)\r
-\r
- addi BBO, BBO, 32\r
+.macro KERNEL1x1\r
+ LOAD1x1 0\r
+ END1x1 AO, BO, 16,16\r
\r
.endm\r
\r
-.macro ZCOPYB_8\r
-\r
- lxv vs32, 0(BO)\r
- lxv vs33, 16(BO)\r
- lxv vs34, 32(BO)\r
- lxv vs35, 48(BO) \r
-\r
- lxv vs36, 64+0(BO)\r
- lxv vs37, 64+16(BO)\r
- lxv vs38, 64+32(BO)\r
- lxv vs39, 64+48(BO) \r
- addi BO, BO, 128\r
- xxspltd vs40, vs32, 1\r
- xxspltd vs41, vs32, 0\r
- xxspltd vs42, vs33, 1\r
- xxspltd vs43, vs33, 0\r
- xxspltd vs44, vs34, 1\r
- xxspltd vs45, vs34, 0\r
- xxspltd vs46, vs35, 1\r
- xxspltd vs47, vs35, 0 \r
-\r
- xxspltd vs48, vs36, 1\r
- xxspltd vs49, vs36, 0\r
- xxspltd vs50, vs37, 1\r
- xxspltd vs51, vs37, 0\r
- xxspltd vs52, vs38, 1\r
- xxspltd vs53, vs38, 0\r
- xxspltd vs54, vs39, 1\r
- xxspltd vs55, vs39, 0\r
-\r
- stxv vs40, 0(BBO)\r
- stxv vs41, 16(BBO)\r
- stxv vs42, 32(BBO)\r
- stxv vs43, 48(BBO) \r
-\r
- stxv vs44, 64+0(BBO)\r
- stxv vs45, 64+16(BBO)\r
- stxv vs46, 64+32(BBO)\r
- stxv vs47, 64+48(BBO) \r
-\r
- stxv vs48, 128+ 0(BBO)\r
- stxv vs49, 128+ 16(BBO)\r
- stxv vs50, 128+ 32(BBO)\r
- stxv vs51, 128+ 48(BBO) \r
-\r
- stxv vs52, 192 + 0(BBO)\r
- stxv vs53, 192 + 16(BBO)\r
- stxv vs54, 192+ 32(BBO)\r
- stxv vs55, 192 + 48(BBO)\r
- addi BBO, BBO, 256\r
-\r
+.macro SAVE1x1 \r
+ SAVE1 vs32,vs33,CO,0\r
+ addi CO, CO, 16 \r
.endm\r
\r