From: wernsaar Date: Thu, 7 Nov 2013 16:18:56 +0000 (+0100) Subject: added optimized blas level1 copy kernels X-Git-Tag: v0.2.9.rc1~7^2~6^2~56 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8fa93be06ec13bccb1d2e710615bb42e3b694c92;p=platform%2Fupstream%2Fopenblas.git added optimized blas level1 copy kernels --- diff --git a/kernel/arm/ccopy_vfpv3.S b/kernel/arm/ccopy_vfpv3.S new file mode 100644 index 0000000..aaba782 --- /dev/null +++ b/kernel/arm/ccopy_vfpv3.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s7 } + fstmias Y!, { s0 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 - s1 } + fstmias Y!, { s0 - s1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble ccopy_kernel_L999 + + cmp INC_X, #0 + beq ccopy_kernel_L999 + + cmp INC_Y, #0 + beq ccopy_kernel_L999 + + cmp INC_X, #1 + bne ccopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne ccopy_kernel_S_BEGIN + +ccopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_F1 + +ccopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne ccopy_kernel_F4 + +ccopy_kernel_F1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne ccopy_kernel_F10 + + b ccopy_kernel_L999 + +ccopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_S1 + +ccopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne ccopy_kernel_S4 + +ccopy_kernel_S1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne ccopy_kernel_S10 + + + + + + +ccopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dcopy_vfpv3.S b/kernel/arm/dcopy_vfpv3.S new file mode 100644 index 0000000..0fad3c4 --- /dev/null +++ b/kernel/arm/dcopy_vfpv3.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d0 - d3 } + fstmiad Y!, { d0 - d3 } + +.endm + +.macro COPY_F1 + + fldmiad X!, { d0 } + fstmiad Y!, { d0 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d1 } + fstmiad Y, { d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d1 } + fstmiad Y, { d1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble dcopy_kernel_L999 + + cmp INC_X, #0 + beq dcopy_kernel_L999 + + cmp INC_Y, #0 + beq dcopy_kernel_L999 + + cmp INC_X, #1 + bne dcopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne dcopy_kernel_S_BEGIN + +dcopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble dcopy_kernel_F1 + +dcopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne dcopy_kernel_F4 + +dcopy_kernel_F1: + + ands I, N, #3 + ble dcopy_kernel_L999 + +dcopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne dcopy_kernel_F10 + + b dcopy_kernel_L999 + +dcopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble dcopy_kernel_S1 + +dcopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne dcopy_kernel_S4 + +dcopy_kernel_S1: + + ands I, N, #3 + ble dcopy_kernel_L999 + +dcopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne dcopy_kernel_S10 + + + + + + +dcopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/scopy_vfpv3.S b/kernel/arm/scopy_vfpv3.S new file mode 100644 index 0000000..e6ceaf2 --- /dev/null +++ b/kernel/arm/scopy_vfpv3.S @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F8 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s3 } + fldmias X!, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias Y!, { s4 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 } + fstmias Y!, { s0 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble scopy_kernel_L999 + + cmp INC_X, #0 + beq scopy_kernel_L999 + + cmp INC_Y, #0 + beq scopy_kernel_L999 + + cmp INC_X, #1 + bne scopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne scopy_kernel_S_BEGIN + +scopy_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble scopy_kernel_F1 + +scopy_kernel_F8: + + COPY_F8 + + subs I, I, #1 + bne scopy_kernel_F8 + +scopy_kernel_F1: + + ands I, N, #7 + ble scopy_kernel_L999 + +scopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne scopy_kernel_F10 + + b scopy_kernel_L999 + +scopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble scopy_kernel_S1 + +scopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne scopy_kernel_S4 + +scopy_kernel_S1: + + ands I, N, #3 + ble scopy_kernel_L999 + +scopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne scopy_kernel_S10 + + + + + + +scopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zcopy_vfpv3.S b/kernel/arm/zcopy_vfpv3.S new file mode 100644 index 0000000..06f8924 --- /dev/null +++ b/kernel/arm/zcopy_vfpv3.S @@ -0,0 +1,223 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + pld [ X, #X_PRE+32 ] + fldmiad X!, { d0 - d7 } + fstmiad Y!, { d0 - d7 } + +.endm + +.macro COPY_F1 + + fldmiad X!, { d0 - d1 } + fstmiad Y!, { d0 - d1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble zcopy_kernel_L999 + + cmp INC_X, #0 + beq zcopy_kernel_L999 + + cmp INC_Y, #0 + beq zcopy_kernel_L999 + + cmp INC_X, #1 + bne zcopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne zcopy_kernel_S_BEGIN + +zcopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_F1 + +zcopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne zcopy_kernel_F4 + +zcopy_kernel_F1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne zcopy_kernel_F10 + + b zcopy_kernel_L999 + +zcopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_S1 + +zcopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne zcopy_kernel_S4 + +zcopy_kernel_S1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne zcopy_kernel_S10 + + + + + + +zcopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE +