LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
PREFETCH2 0*SIZE(prebb);
XOR_DY yvec15, yvec15, yvec15;
PREFETCH2 8*SIZE(prebb);
.L2_bodyB:;
# Computing kernel
-#### Unroll times 1 ####
+//#### Unroll times 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 2 ####
+//#### Unroll times 2 ####
LD_DY 12*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 3 ####
+//#### Unroll times 3 ####
LD_DY 20*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 4 ####
+//#### Unroll times 4 ####
LD_DY 28*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
JLE .L3_loopE;
ALIGN_5
.L3_bodyB:
-#### Unroll times 1 ####
+//#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 2 ####
+//#### Unroll times 2 ####
PREFETCH0 72*SIZE(ptrba)
LD_DY 12*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
JLE .L4_loopE;
ALIGN_5
.L4_bodyB:;
-#### Unroll times 1 ####
+//#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec8, yvec7, yvec8;
.L4_loopE:;
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DY MEMALPHA,yvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DY yvec7,yvec15,yvec15;
MUL_DY yvec7,yvec14,yvec14;
MUL_DY yvec7,yvec13,yvec13;
MUL_DY yvec7,yvec10,yvec10;
MUL_DY yvec7,yvec9,yvec9;
MUL_DY yvec7,yvec8,yvec8;
-#### Reverse the Results ####
+//#### Reverse the Results ####
MOV_DY yvec15,yvec7;
REVS_DY $0x0a,yvec13,yvec15,yvec15;
REVS_DY $0x0a,yvec7,yvec13,yvec13;
MOV_DY yvec10,yvec7;
REVS_DY $0x0a,yvec8,yvec10,yvec10;
REVS_DY $0x0a,yvec7,yvec8,yvec8;
-#### Testing alignment ####
+//#### Testing alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L4_loopEx; # Unalign part write back
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec14,xvec6;
EXTRA_DY $1,yvec13,xvec5;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec13, yvec13, yvec13;
LD_DY 0*SIZE(ptrbb), yvec2;
.L6_bodyB:;
# Computing kernel
-#### Untoll time 1 ####
+//#### Unroll time 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
MUL_DY yvec0, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 2 ####
+//#### Unroll time 2 ####
LD_DY 8*SIZE(ptrba), yvec0;
MUL_DY yvec1, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 3 ####
+//#### Unroll time 3 ####
LD_DY 12*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
MUL_DY yvec0, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 4 ####
+//#### Unroll time 4 ####
LD_DY 0*SIZE(ptrba), yvec0;
MUL_DY yvec1, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
JLE .L7_loopE;
ALIGN_5
.L7_bodyB:;
-#### Untoll time 1 ####
+//#### Unroll time 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
MUL_DY yvec0, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 2 ####
+//#### Unroll time 2 ####
LD_DY 0*SIZE(ptrba), yvec0;
MUL_DY yvec1, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
JLE .L8_loopE;
ALIGN_5
.L8_bodyB:;
-#### Untoll time 1 ####
+//#### Unroll time 1 ####
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
ADD_DY yvec9, yvec7, yvec9;
.L8_loopE:;
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DY MEMALPHA, yvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DY yvec7,yvec15,yvec15;
MUL_DY yvec7,yvec13,yvec13;
MUL_DY yvec7,yvec11,yvec11;
MUL_DY yvec7,yvec9,yvec9;
-#### Reverse the Results ####
+//#### Reverse the Results ####
MOV_DY yvec15, yvec7;
REVS_DY $0x0a,yvec13,yvec15,yvec15;
REVS_DY $0x0a,yvec7,yvec13,yvec13;
MOV_DY yvec11,yvec7;
REVS_DY $0x0a,yvec9,yvec11,yvec11;
REVS_DY $0x0a,yvec7,yvec9,yvec9;
-#### Testing alignment ####
+//#### Testing alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L8_loopEx; # Unalign part write back
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec13,xvec5;
EXTRA_DY $1,yvec11,xvec3;
LEAQ (ptrba, %rax, 2), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
LD_DX 0*SIZE(ptrbb), xvec2;
XOR_DY yvec15, yvec15, yvec15;
LD_DX 2*SIZE(ptrbb), xvec3;
.L10_bodyB:;
# Computing kernel
-##### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 4*SIZE(ptrbb), xvec6;
SHUF_DX $0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 8*SIZE(ptrbb), xvec2;
SHUF_DX $0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-##### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 12*SIZE(ptrbb), xvec6;
SHUF_DX $0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 0*SIZE(ptrbb), xvec2;
SHUF_DX $0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
JLE .L11_loopE;
ALIGN_5
.L11_bodyB:;
-##### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 4*SIZE(ptrbb), xvec6;
SHUF_DX $0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 0*SIZE(ptrbb), xvec2;
SHUF_DX $0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
ADD_DX xvec5, xvec9, xvec9;
.L12_loopE:;
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DX MEMALPHA, xvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec13, xvec13;
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec9, xvec9;
-#### Reverse the Results ####
+//#### Reverse the Results ####
MOV_DX xvec15, xvec6;
REVS_DX xvec13, xvec15, xvec15;
REVS_DX xvec6, xvec13, xvec13;
MOV_DX xvec11, xvec6;
REVS_DX xvec9, xvec11, xvec11;
REVS_DX xvec6, xvec9, xvec9;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L12_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec13, xvec13;
ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15;
ADDQ %rax, ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
ADDQ $4*SIZE, ptrbb;
.L16_loopE:
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DY MEMALPHA, yvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DY yvec15, yvec7, yvec15;
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec7;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
XOR_DY yvec13, yvec13, yvec13;
ALIGN_5;
.L211_bodyB:
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 8*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 16*SIZE(ptrba), xvec0;
LD_DX 4*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 24*SIZE(ptrba), xvec0;
LD_DX 6*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
ALIGN_5;
.L212_bodyB:
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 8*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
JLE .L213_loopE;
ALIGN_5
.L213_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
ADD_DX xvec7, xvec8, xvec8;
.L213_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec14, xvec14;
MUL_DX xvec7, xvec10, xvec10;
MUL_DX xvec7, xvec9, xvec9;
MUL_DX xvec7, xvec8, xvec8;
-#### Reverse #####
+//#### Reverse ####
MOV_DX xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
MOV_DX xvec12, xvec6;
REVS_DX xvec8, xvec12, xvec12;
REVS_DX xvec6, xvec8, xvec8;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L213_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 2*SIZE(C0), xvec10, xvec10;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
XOR_DY yvec11, yvec11, yvec11;
ALIGN_5
.L221_bodyB:;
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 4*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 8*SIZE(ptrba), xvec0;
LD_DX 4*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 12*SIZE(ptrba), xvec0;
LD_DX 6*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
JLE .L222_loopE;
ALIGN_5
.L222_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 4*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
JLE .L223_loopE;
ALIGN_5
.L223_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
ADD_DX xvec5, xvec10, xvec10;
.L223_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec14, xvec14;
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec10, xvec10;
-#### Reverse #####
+//#### Reverse ####
MOV_DX xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
MOV_DX xvec14, xvec6;
REVS_DX xvec10, xvec14, xvec14;
REVS_DX xvec6, xvec10, xvec10;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L223_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 2*SIZE(C0), xvec10, xvec10;
ADDQ $4*SIZE, C0;
ADDQ $4*SIZE, C1;
.L22_loopE:;
-TEST $2, bm; # Rm = 2
+TEST $2, bm; // Rm = 2
JLE .L23_loopE;
ALIGN_5;
.L23_bodyB:
ALIGN_5
.L231_bodyB:
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 2*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 4*SIZE(ptrba), xvec0;
LD_DX 4*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 6*SIZE(ptrba), xvec0;
LD_DX 6*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
JLE .L232_loopE;
ALIGN_5
.L232_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 2*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
JLE .L233_loopE;
ALIGN_5
.L233_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
ADD_DX xvec5, xvec11, xvec11;
ADDQ $2*SIZE, ptrbb;
.L233_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec11, xvec11;
-#### Reverse #####
+//#### Reverse ####
MOV_DX xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L233_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 0*SIZE(C1), xvec15, xvec15;
ADDQ $2*SIZE, C0;
ADDQ $2*SIZE, C1;
.L23_loopE:
-TEST $1, bm; # Rm = 1
+TEST $1, bm; // Rm = 1
JLE .L24_loopE;
ALIGN_5;
.L24_bodyB:
ADDQ k, bb;
LEAQ (C, ldc, 2), C;
.L20_loopE:;
-TEST $1, bn; # Rn = 1
+TEST $1, bn; // Rn = 1
JLE .L30_loopE;
ALIGN_5
.L30_bodyB:
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
#ifndef TRMMKERNEL
JLE .L311_loopE;
ALIGN_5
.L311_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DY 8*SIZE(ptrba), yvec3;
LD_DY 12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
MUL_DY yvec5, yvec4, yvec4
ADD_DY yvec4, yvec14, yvec14;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DY 16*SIZE(ptrba), yvec0;
LD_DY 20*SIZE(ptrba), yvec1;
BROAD_DY 2*SIZE(ptrbb), yvec2;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DY 24*SIZE(ptrba), yvec3;
LD_DY 28*SIZE(ptrba), yvec4;
BROAD_DY 3*SIZE(ptrbb), yvec5;
JLE .L312_loopE;
ALIGN_5
.L312_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DY 8*SIZE(ptrba), yvec3;
LD_DY 12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
JLE .L313_loopE;
ALIGN_5
.L313_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
ADDQ $1*SIZE, ptrbb;
.L313_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY yvec7, yvec15, yvec15;
MUL_DY yvec7, yvec14, yvec14;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L313_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
#ifndef TRMMKERNEL
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
ADDQ $1*SIZE, ptrbb;
.L323_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY yvec7, yvec15, yvec15;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L323_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec14;
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec15, xvec15;
JMP .L32_loopE;
ALIGN_5
.L323_loopEx:
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec14;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec13, xvec13;
LEAQ (ptrba, %rax, 2), ptrba
ADDQ %rax, ptrbb;
#endif
-#### Initial Result ####
+//#### Initial Result ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
ADDQ $2*SIZE, ptrba;
ADDQ $1*SIZE, ptrbb;
.L333_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
addq $1*SIZE, ptrbb;
.L343_loopE:
-#### Writing Back ####
+//#### Writing Back ####
vmovsd MEMALPHA, xvec7;
vmulsd xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL