1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
43 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
47 BLASLONG inc_x2, inc_y2;
48 FLOAT x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
49 v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7, vx8, vx9, vx10, vx11;
50 v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7, vy8, vy9, vy10, vy11;
51 v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
52 v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
53 v4f32 dot0 = {0, 0, 0, 0};
54 v4f32 dot1 = {0, 0, 0, 0};
55 v4f32 dot2 = {0, 0, 0, 0};
56 v4f32 dot3 = {0, 0, 0, 0};
57 v4f32 dot4 = {0, 0, 0, 0};
58 v4f32 dot5 = {0, 0, 0, 0};
59 v4f32 dot6 = {0, 0, 0, 0};
60 v4f32 dot7 = {0, 0, 0, 0};
61 OPENBLAS_COMPLEX_FLOAT result;
69 if (n < 1) return (result);
71 if ((1 == inc_x) && (1 == inc_y))
75 FLOAT *x_pref, *y_pref;
78 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
81 pref_offset = L1_DATA_LINESIZE - pref_offset;
82 pref_offset = pref_offset / sizeof(FLOAT);
84 x_pref = x + pref_offset + 64 + 16;
86 pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
89 pref_offset = L1_DATA_LINESIZE - pref_offset;
90 pref_offset = pref_offset / sizeof(FLOAT);
92 y_pref = y + pref_offset + 64 + 16;
94 LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
95 LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
97 PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
98 PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
100 for (i = (n >> 4) - 1; i--;)
102 PREF_OFFSET(x_pref, 0);
103 PREF_OFFSET(x_pref, 32);
104 PREF_OFFSET(x_pref, 64);
105 PREF_OFFSET(x_pref, 96);
106 PREF_OFFSET(y_pref, 0);
107 PREF_OFFSET(y_pref, 32);
108 PREF_OFFSET(y_pref, 64);
109 PREF_OFFSET(y_pref, 96);
113 vx4 = LD_SP(x); x += 4;
114 vx1r = (v4f32) __msa_pckev_w((v4i32) vx3, (v4i32) vx2);
115 dot0 += (vx0r * vy0r);
116 vx5 = LD_SP(x); x += 4;
117 vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2);
118 dot1 OP2 (vx0i * vy0r);
119 vy4 = LD_SP(y); y += 4;
120 vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2);
121 dot2 += (vx1r * vy1r);
122 vy5 = LD_SP(y); y += 4;
123 vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2);
124 dot3 OP2 (vx1i * vy1r);
125 vx6 = LD_SP(x); x += 4;
126 vx7 = LD_SP(x); x += 4;
127 vy6 = LD_SP(y); y += 4;
128 vy7 = LD_SP(y); y += 4;
129 vx8 = LD_SP(x); x += 4;
130 dot0 OP1 (vx0i * vy0i);
131 vx9 = LD_SP(x); x += 4;
132 vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4);
133 dot1 += (vx0r * vy0i);
134 vy8 = LD_SP(y); y += 4;
135 vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4);
136 dot2 OP1 (vx1i * vy1i);
137 vy9 = LD_SP(y); y += 4;
138 vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4);
139 dot3 += (vx1r * vy1i);
140 vx10 = LD_SP(x); x += 4;
141 vy2i = (v4f32) __msa_pckod_w((v4i32) vy5, (v4i32) vy4);
142 vx11 = LD_SP(x); x += 4;
143 vx3r = (v4f32) __msa_pckev_w((v4i32) vx7, (v4i32) vx6);
144 dot4 += (vx2r * vy2r);
145 vy10 = LD_SP(y); y += 4;
146 vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6);
147 dot5 OP2 (vx2i * vy2r);
148 vy11 = LD_SP(y); y += 4;
149 vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6);
150 vy3i = (v4f32) __msa_pckod_w((v4i32) vy7, (v4i32) vy6);
151 dot6 += (vx3r * vy3r);
152 vx0r = (v4f32) __msa_pckev_w((v4i32) vx9, (v4i32) vx8);
153 dot7 OP2 (vx3i * vy3r);
154 vx0i = (v4f32) __msa_pckod_w((v4i32) vx9, (v4i32) vx8);
155 vy0r = (v4f32) __msa_pckev_w((v4i32) vy9, (v4i32) vy8);
157 vy0i = (v4f32) __msa_pckod_w((v4i32) vy9, (v4i32) vy8);
159 dot4 OP1 (vx2i * vy2i);
161 dot5 += (vx2r * vy2i);
163 dot6 OP1 (vx3i * vy3i);
164 dot7 += (vx3r * vy3i);
167 vx4 = LD_SP(x); x += 4;
168 vx1r = (v4f32) __msa_pckev_w((v4i32) vx3, (v4i32) vx2);
169 dot0 += (vx0r * vy0r);
170 vx5 = LD_SP(x); x += 4;
171 vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2);
172 dot1 OP2 (vx0i * vy0r);
173 vy4 = LD_SP(y); y += 4;
174 vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2);
175 dot2 += (vx1r * vy1r);
176 vy5 = LD_SP(y); y += 4;
177 vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2);
178 dot3 OP2 (vx1i * vy1r);
179 vx6 = LD_SP(x); x += 4;
180 vx7 = LD_SP(x); x += 4;
181 vy6 = LD_SP(y); y += 4;
182 vy7 = LD_SP(y); y += 4;
183 dot0 OP1 (vx0i * vy0i);
184 vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4);
185 dot1 += (vx0r * vy0i);
186 vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4);
187 dot2 OP1 (vx1i * vy1i);
188 vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4);
189 dot3 += (vx1r * vy1i);
190 vy2i = (v4f32) __msa_pckod_w((v4i32) vy5, (v4i32) vy4);
191 vx3r = (v4f32) __msa_pckev_w((v4i32) vx7, (v4i32) vx6);
192 dot4 += (vx2r * vy2r);
193 vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6);
194 dot5 OP2 (vx2i * vy2r);
195 vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6);
196 vy3i = (v4f32) __msa_pckod_w((v4i32) vy7, (v4i32) vy6);
197 dot6 += (vx3r * vy3r);
198 dot7 OP2 (vx3i * vy3r);
199 dot4 OP1 (vx2i * vy2i);
200 dot5 += (vx2r * vy2i);
201 dot6 OP1 (vx3i * vy3i);
202 dot7 += (vx3r * vy3i);
209 LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
210 LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
212 PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
213 PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
215 PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
216 PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
218 dot0 += (vx0r * vy0r);
219 dot0 OP1 (vx0i * vy0i);
220 dot1 OP2 (vx0i * vy0r);
221 dot1 += (vx0r * vy0i);
223 dot2 += (vx1r * vy1r);
224 dot2 OP1 (vx1i * vy1i);
225 dot3 OP2 (vx1i * vy1r);
226 dot3 += (vx1r * vy1i);
231 LD_SP2_INC(x, 4, vx0, vx1);
232 LD_SP2_INC(y, 4, vy0, vy1);
233 PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
234 PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
236 dot0 += (vx0r * vy0r);
237 dot0 OP1 (vx0i * vy0i);
238 dot1 OP2 (vx0i * vy0r);
239 dot1 += (vx0r * vy0i);
244 LD_GP4_INC(x, 1, x0, x1, x2, x3);
245 LD_GP4_INC(y, 1, y0, y1, y2, y3);
247 dot[0] += (x0 * y0 OP3 x1 * y1);
248 dot[1] OP2 (x1 * y0 OP4 x0 * y1);
250 dot[0] += (x2 * y2 OP3 x3 * y3);
251 dot[1] OP2 (x3 * y2 OP4 x2 * y3);
256 LD_GP2_INC(x, 1, x0, x1);
257 LD_GP2_INC(y, 1, y0, y1);
259 dot[0] += (x0 * y0 OP3 x1 * y1);
260 dot[1] OP2 (x1 * y0 OP4 x0 * y1);
264 dot0 += dot2 + dot4 + dot6;
265 dot1 += dot3 + dot5 + dot7;
267 dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
268 dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
275 for (i = (n >> 2); i--;)
303 dot[0] += (x0 * y0 OP3 x1 * y1);
304 dot[1] OP2 (x1 * y0 OP4 x0 * y1);
306 dot[0] += (x2 * y2 OP3 x3 * y3);
307 dot[1] OP2 (x3 * y2 OP4 x2 * y3);
309 dot[0] += (x4 * y4 OP3 x5 * y5);
310 dot[1] OP2 (x5 * y4 OP4 x4 * y5);
312 dot[0] += (x6 * y6 OP3 x7 * y7);
313 dot[1] OP2 (x7 * y6 OP4 x6 * y7);
332 dot[0] += (x0 * y0 OP3 x1 * y1);
333 dot[1] OP2 (x1 * y0 OP4 x0 * y1);
335 dot[0] += (x2 * y2 OP3 x3 * y3);
336 dot[1] OP2 (x3 * y2 OP4 x2 * y3);
349 dot[0] += (x0 * y0 OP3 x1 * y1);
350 dot[1] OP2 (x1 * y0 OP4 x0 * y1);
354 CREAL(result) = dot[0];
355 CIMAG(result) = dot[1];