1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
34 FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
36 FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07;
37 FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14;
38 FLOAT ctemp15, ctemp16;
39 v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
40 v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
41 v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
47 for (j = (n >> 3); j--;)
59 for (i = (m >> 2); i--;)
61 LD_SP2_INC(psrc1, 4, src0, src1);
62 LD_SP2_INC(psrc2, 4, src2, src3);
63 LD_SP2_INC(psrc3, 4, src4, src5);
64 LD_SP2_INC(psrc4, 4, src6, src7);
65 LD_SP2_INC(psrc5, 4, src8, src9);
66 LD_SP2_INC(psrc6, 4, src10, src11);
67 LD_SP2_INC(psrc7, 4, src12, src13);
68 LD_SP2_INC(psrc8, 4, src14, src15);
70 ILVRL_D2_SP(src2, src0, dst0, dst4);
71 ILVRL_D2_SP(src6, src4, dst1, dst5);
72 ILVRL_D2_SP(src10, src8, dst2, dst6);
73 ILVRL_D2_SP(src14, src12, dst3, dst7);
75 ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
77 ILVRL_D2_SP(src3, src1, dst0, dst4);
78 ILVRL_D2_SP(src7, src5, dst1, dst5);
79 ILVRL_D2_SP(src11, src9, dst2, dst6);
80 ILVRL_D2_SP(src15, src13, dst3, dst7);
82 ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
104 ILVRL_D2_SP(src2, src0, dst0, dst4);
105 ILVRL_D2_SP(src6, src4, dst1, dst5);
106 ILVRL_D2_SP(src10, src8, dst2, dst6);
107 ILVRL_D2_SP(src14, src12, dst3, dst7);
109 ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
114 ctemp01 = *(psrc1 + 0);
115 ctemp02 = *(psrc1 + 1);
116 ctemp03 = *(psrc2 + 0);
117 ctemp04 = *(psrc2 + 1);
118 ctemp05 = *(psrc3 + 0);
119 ctemp06 = *(psrc3 + 1);
120 ctemp07 = *(psrc4 + 0);
121 ctemp08 = *(psrc4 + 1);
122 ctemp09 = *(psrc5 + 0);
123 ctemp10 = *(psrc5 + 1);
124 ctemp11 = *(psrc6 + 0);
125 ctemp12 = *(psrc6 + 1);
126 ctemp13 = *(psrc7 + 0);
127 ctemp14 = *(psrc7 + 1);
128 ctemp15 = *(psrc8 + 0);
129 ctemp16 = *(psrc8 + 1);
139 *(pdst + 0) = ctemp01;
140 *(pdst + 1) = ctemp02;
141 *(pdst + 2) = ctemp03;
142 *(pdst + 3) = ctemp04;
143 *(pdst + 4) = ctemp05;
144 *(pdst + 5) = ctemp06;
145 *(pdst + 6) = ctemp07;
146 *(pdst + 7) = ctemp08;
147 *(pdst + 8) = ctemp09;
148 *(pdst + 9) = ctemp10;
149 *(pdst + 10) = ctemp11;
150 *(pdst + 11) = ctemp12;
151 *(pdst + 12) = ctemp13;
152 *(pdst + 13) = ctemp14;
153 *(pdst + 14) = ctemp15;
154 *(pdst + 15) = ctemp16;
167 for (i = (m >> 2); i--;)
169 LD_SP2_INC(psrc1, 4, src0, src1);
170 LD_SP2_INC(psrc2, 4, src2, src3);
171 LD_SP2_INC(psrc3, 4, src4, src5);
172 LD_SP2_INC(psrc4, 4, src6, src7);
174 ILVRL_D2_SP(src2, src0, dst0, dst4);
175 ILVRL_D2_SP(src6, src4, dst1, dst5);
177 ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
179 ILVRL_D2_SP(src3, src1, dst0, dst4);
180 ILVRL_D2_SP(src7, src5, dst1, dst5);
182 ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
196 ILVRL_D2_SP(src2, src0, dst0, dst4);
197 ILVRL_D2_SP(src6, src4, dst1, dst5);
199 ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
204 ctemp01 = *(psrc1 + 0);
205 ctemp02 = *(psrc1 + 1);
206 ctemp03 = *(psrc2 + 0);
207 ctemp04 = *(psrc2 + 1);
208 ctemp05 = *(psrc3 + 0);
209 ctemp06 = *(psrc3 + 1);
210 ctemp07 = *(psrc4 + 0);
211 ctemp08 = *(psrc4 + 1);
217 *(pdst + 0) = ctemp01;
218 *(pdst + 1) = ctemp02;
219 *(pdst + 2) = ctemp03;
220 *(pdst + 3) = ctemp04;
221 *(pdst + 4) = ctemp05;
222 *(pdst + 5) = ctemp06;
223 *(pdst + 6) = ctemp07;
224 *(pdst + 7) = ctemp08;
235 for (i = (m >> 2); i--;)
237 LD_SP2_INC(psrc1, 4, src0, src1);
238 LD_SP2_INC(psrc2, 4, src2, src3);
240 ILVRL_D2_SP(src2, src0, dst0, dst4);
242 ST_SP2_INC(dst0, dst4, pdst, 4);
244 ILVRL_D2_SP(src3, src1, dst0, dst4);
246 ST_SP2_INC(dst0, dst4, pdst, 4);
256 ILVRL_D2_SP(src2, src0, dst0, dst4);
258 ST_SP2_INC(dst0, dst4, pdst, 4);
263 ctemp01 = *(psrc1 + 0);
264 ctemp02 = *(psrc1 + 1);
265 ctemp03 = *(psrc2 + 0);
266 ctemp04 = *(psrc2 + 1);
270 *(pdst + 0) = ctemp01;
271 *(pdst + 1) = ctemp02;
272 *(pdst + 2) = ctemp03;
273 *(pdst + 3) = ctemp04;
282 for (i = (m >> 2); i--;)
284 LD_SP2_INC(psrc1, 4, src0, src1);
285 ST_SP2_INC(src0, src1, pdst, 4);
299 ctemp01 = *(psrc1 + 0);
300 ctemp02 = *(psrc1 + 1);
303 *(pdst + 0) = ctemp01;
304 *(pdst + 1) = ctemp02;