1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
30 #include "macros_msa.h"
/* Lane-wise absolute value of a v4f32 vector: reinterpret as v4i32 and
 * clear each lane's IEEE-754 sign bit by ANDing with 0x7FFFFFFF (the
 * `and_vec` mask declared by the enclosing function), then reinterpret
 * back to v4f32.  The macro argument is parenthesized so the v4i32 cast
 * binds to the whole expression even when the caller passes a compound
 * expression (e.g. `a + b`). */
#define AND_VEC_W(in) ((v4f32) ((v4i32) (in) & and_vec))
/* Absolute-value sum (BLAS *asum) kernel vectorized with MIPS MSA 128-bit
 * vectors: accumulates |x[i]| over n entries of x with stride inc_x and
 * returns the scalar total.
 * NOTE(review): this view of the file is incomplete — braces, branch
 * headers (presumably an `if (inc_x == 1) ... else ...` split between the
 * contiguous path and the strided `inc_x2` path) and the declarations of
 * `i`, `sumf`, `pref_offset`, `x_pref`, `inc_x2` are not visible here.
 * The two-lane final reduction and the `inc_x2` stride suggest a complex
 * (two floats per element) variant — TODO confirm against the full file. */
34 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
/* Sixteen vector temporaries allow software pipelining: the next batch is
 * loaded while the previous batch is being accumulated. */
38 v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
39 v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
/* Four independent accumulators break the floating-point add dependency
 * chain across iterations; they are folded together at the end. */
40 v4f32 sum_abs0 = {0, 0, 0, 0};
41 v4f32 sum_abs1 = {0, 0, 0, 0};
42 v4f32 sum_abs2 = {0, 0, 0, 0};
43 v4f32 sum_abs3 = {0, 0, 0, 0};
/* Per-lane mask that clears the IEEE-754 sign bit — used by AND_VEC_W to
 * compute |x| without a branch. */
44 v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
/* Degenerate input: nothing to sum (sumf presumably initialized to 0 on a
 * line not visible here). */
46 if (n <= 0 || inc_x <= 0) return (sumf);
/* Compute how far x is from the next L1 cache-line boundary so software
 * prefetches below land on line-aligned addresses. */
55 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
58 pref_offset = L1_DATA_LINESIZE - pref_offset;
59 pref_offset = pref_offset / sizeof(FLOAT);
/* Prefetch cursor runs 160 floats (128 + 32) ahead of the load cursor. */
61 x_pref = x + pref_offset + 128 + 32;
/* --- contiguous path: 32 floats per iteration, software-pipelined ---
 * Prime the pipeline with the first 8 vectors before entering the loop. */
63 LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
64 for (i = (n >> 5) - 1; i--;)
/* Prefetch the next 256 bytes' worth of cache lines for upcoming loads. */
66 PREF_OFFSET(x_pref, 0);
67 PREF_OFFSET(x_pref, 32);
68 PREF_OFFSET(x_pref, 64);
69 PREF_OFFSET(x_pref, 96);
70 PREF_OFFSET(x_pref, 128);
71 PREF_OFFSET(x_pref, 160);
72 PREF_OFFSET(x_pref, 192);
73 PREF_OFFSET(x_pref, 224);
/* Load the next 8 vectors, then accumulate |.| of the previous 8. */
76 LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15);
78 sum_abs0 += AND_VEC_W(src0);
79 sum_abs1 += AND_VEC_W(src1);
80 sum_abs2 += AND_VEC_W(src2);
81 sum_abs3 += AND_VEC_W(src3);
82 sum_abs0 += AND_VEC_W(src4);
83 sum_abs1 += AND_VEC_W(src5);
84 sum_abs2 += AND_VEC_W(src6);
85 sum_abs3 += AND_VEC_W(src7);
/* Second half of the double-buffered iteration. */
87 LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
89 sum_abs0 += AND_VEC_W(src8);
90 sum_abs1 += AND_VEC_W(src9);
91 sum_abs2 += AND_VEC_W(src10);
92 sum_abs3 += AND_VEC_W(src11);
93 sum_abs0 += AND_VEC_W(src12);
94 sum_abs1 += AND_VEC_W(src13);
95 sum_abs2 += AND_VEC_W(src14);
96 sum_abs3 += AND_VEC_W(src15);
/* Loop epilogue: drain the pipeline — load the last 8 vectors and
 * accumulate all 16 still in flight. */
99 LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15);
101 sum_abs0 += AND_VEC_W(src0);
102 sum_abs1 += AND_VEC_W(src1);
103 sum_abs2 += AND_VEC_W(src2);
104 sum_abs3 += AND_VEC_W(src3);
105 sum_abs0 += AND_VEC_W(src4);
106 sum_abs1 += AND_VEC_W(src5);
107 sum_abs2 += AND_VEC_W(src6);
108 sum_abs3 += AND_VEC_W(src7);
109 sum_abs0 += AND_VEC_W(src8);
110 sum_abs1 += AND_VEC_W(src9);
111 sum_abs2 += AND_VEC_W(src10);
112 sum_abs3 += AND_VEC_W(src11);
113 sum_abs0 += AND_VEC_W(src12);
114 sum_abs1 += AND_VEC_W(src13);
115 sum_abs2 += AND_VEC_W(src14);
116 sum_abs3 += AND_VEC_W(src15);
/* Tail handling, presumably gated on bits of n (not visible here):
 * 32-float chunk... */
123 LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
125 sum_abs0 += AND_VEC_W(src0);
126 sum_abs1 += AND_VEC_W(src1);
127 sum_abs2 += AND_VEC_W(src2);
128 sum_abs3 += AND_VEC_W(src3);
129 sum_abs0 += AND_VEC_W(src4);
130 sum_abs1 += AND_VEC_W(src5);
131 sum_abs2 += AND_VEC_W(src6);
132 sum_abs3 += AND_VEC_W(src7);
/* ...16-float chunk... */
137 LD_SP4_INC(x, 4, src0, src1, src2, src3);
139 sum_abs0 += AND_VEC_W(src0);
140 sum_abs1 += AND_VEC_W(src1);
141 sum_abs2 += AND_VEC_W(src2);
142 sum_abs3 += AND_VEC_W(src3);
/* ...8-float chunk... */
147 LD_SP2_INC(x, 4, src0, src1);
149 sum_abs0 += AND_VEC_W(src0);
150 sum_abs1 += AND_VEC_W(src1);
/* ...single vector (4 floats)... */
155 src0 = LD_SP(x); x += 4;
157 sum_abs0 += AND_VEC_W(src0);
/* ...and a scalar remainder handled with fabsf. */
163 sumf += fabsf(*(x + 1));
/* Fold the four vector accumulators into one before the lane reduction. */
167 sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
/* --- strided path: vector loads advanced by inc_x2 per LD, same
 * double-buffered structure as above but without prefetching. */
180 LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
181 for (i = (n >> 4) - 1; i--;)
183 LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15);
185 sum_abs0 += AND_VEC_W(src0);
186 sum_abs1 += AND_VEC_W(src1);
187 sum_abs2 += AND_VEC_W(src2);
188 sum_abs3 += AND_VEC_W(src3);
189 sum_abs0 += AND_VEC_W(src4);
190 sum_abs1 += AND_VEC_W(src5);
191 sum_abs2 += AND_VEC_W(src6);
192 sum_abs3 += AND_VEC_W(src7);
194 LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
196 sum_abs0 += AND_VEC_W(src8);
197 sum_abs1 += AND_VEC_W(src9);
198 sum_abs2 += AND_VEC_W(src10);
199 sum_abs3 += AND_VEC_W(src11);
200 sum_abs0 += AND_VEC_W(src12);
201 sum_abs1 += AND_VEC_W(src13);
202 sum_abs2 += AND_VEC_W(src14);
203 sum_abs3 += AND_VEC_W(src15);
/* Epilogue of the strided main loop: drain all 16 in-flight vectors. */
206 LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15);
208 sum_abs0 += AND_VEC_W(src0);
209 sum_abs1 += AND_VEC_W(src1);
210 sum_abs2 += AND_VEC_W(src2);
211 sum_abs3 += AND_VEC_W(src3);
212 sum_abs0 += AND_VEC_W(src4);
213 sum_abs1 += AND_VEC_W(src5);
214 sum_abs2 += AND_VEC_W(src6);
215 sum_abs3 += AND_VEC_W(src7);
216 sum_abs0 += AND_VEC_W(src8);
217 sum_abs1 += AND_VEC_W(src9);
218 sum_abs2 += AND_VEC_W(src10);
219 sum_abs3 += AND_VEC_W(src11);
220 sum_abs0 += AND_VEC_W(src12);
221 sum_abs1 += AND_VEC_W(src13);
222 sum_abs2 += AND_VEC_W(src14);
223 sum_abs3 += AND_VEC_W(src15);
/* Strided tail chunks, mirroring the contiguous path's tails. */
230 LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
232 sum_abs0 += AND_VEC_W(src0);
233 sum_abs1 += AND_VEC_W(src1);
234 sum_abs2 += AND_VEC_W(src2);
235 sum_abs3 += AND_VEC_W(src3);
236 sum_abs0 += AND_VEC_W(src4);
237 sum_abs1 += AND_VEC_W(src5);
238 sum_abs2 += AND_VEC_W(src6);
239 sum_abs3 += AND_VEC_W(src7);
244 LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);
246 sum_abs0 += AND_VEC_W(src0);
247 sum_abs1 += AND_VEC_W(src1);
248 sum_abs2 += AND_VEC_W(src2);
249 sum_abs3 += AND_VEC_W(src3);
254 LD_SP2_INC(x, inc_x2, src0, src1);
256 sum_abs0 += AND_VEC_W(src0);
257 sum_abs1 += AND_VEC_W(src1);
/* Last single vector (its LD_SP, presumably on line 262-263, is not
 * visible in this view). */
264 sum_abs0 += AND_VEC_W(src0);
/* Final reduction: fold accumulators, then sum the vector lanes into the
 * scalar result.  NOTE(review): only lanes 0 and 1 are added here, which
 * fits a two-floats-per-element (complex) layout — confirm against the
 * full file, where the return statement also lives. */
268 sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
270 sumf = sum_abs0[0] + sum_abs0[1];