1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
30 #include "macros_msa.h"
/*
 * AND_VEC_D(in): reinterpret the vector `in` as v2i64, bitwise-AND it with
 * `and_vec`, and reinterpret the result as v2f64.
 *
 * `and_vec` is NOT a macro parameter: a v2i64 variable with that exact name
 * must be in scope at the expansion site. In this file it holds
 * 0x7FFFFFFFFFFFFFFF in both lanes, i.e. every bit except the top (sign) bit
 * of each 64-bit lane, so the result is the lane-wise absolute value.
 *
 * The argument is parenthesized so that an expression argument (for example
 * `a + b`) is cast as a whole instead of only its first operand.
 */
#define AND_VEC_D(in) ((v2f64) ((v2i64) (in) & and_vec))
34 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
38 v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
39 v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
40 v2f64 sum_abs0 = {0, 0};
41 v2f64 sum_abs1 = {0, 0};
42 v2f64 sum_abs2 = {0, 0};
43 v2f64 sum_abs3 = {0, 0};
44 v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
46 if (n <= 0 || inc_x <= 0) return (sumf);
55 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
58 pref_offset = L1_DATA_LINESIZE - pref_offset;
59 pref_offset = pref_offset / sizeof(FLOAT);
61 x_pref = x + pref_offset + 64 + 16;
63 LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
64 for (i = (n >> 5) - 1; i--;)
66 PREF_OFFSET(x_pref, 0);
67 PREF_OFFSET(x_pref, 32);
68 PREF_OFFSET(x_pref, 64);
69 PREF_OFFSET(x_pref, 96);
70 PREF_OFFSET(x_pref, 128);
71 PREF_OFFSET(x_pref, 160);
72 PREF_OFFSET(x_pref, 192);
73 PREF_OFFSET(x_pref, 224);
76 LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
78 sum_abs0 += AND_VEC_D(src0);
79 sum_abs1 += AND_VEC_D(src1);
80 sum_abs2 += AND_VEC_D(src2);
81 sum_abs3 += AND_VEC_D(src3);
82 sum_abs0 += AND_VEC_D(src4);
83 sum_abs1 += AND_VEC_D(src5);
84 sum_abs2 += AND_VEC_D(src6);
85 sum_abs3 += AND_VEC_D(src7);
87 LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
89 sum_abs0 += AND_VEC_D(src8);
90 sum_abs1 += AND_VEC_D(src9);
91 sum_abs2 += AND_VEC_D(src10);
92 sum_abs3 += AND_VEC_D(src11);
93 sum_abs0 += AND_VEC_D(src12);
94 sum_abs1 += AND_VEC_D(src13);
95 sum_abs2 += AND_VEC_D(src14);
96 sum_abs3 += AND_VEC_D(src15);
99 LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
101 sum_abs0 += AND_VEC_D(src0);
102 sum_abs1 += AND_VEC_D(src1);
103 sum_abs2 += AND_VEC_D(src2);
104 sum_abs3 += AND_VEC_D(src3);
105 sum_abs0 += AND_VEC_D(src4);
106 sum_abs1 += AND_VEC_D(src5);
107 sum_abs2 += AND_VEC_D(src6);
108 sum_abs3 += AND_VEC_D(src7);
109 sum_abs0 += AND_VEC_D(src8);
110 sum_abs1 += AND_VEC_D(src9);
111 sum_abs2 += AND_VEC_D(src10);
112 sum_abs3 += AND_VEC_D(src11);
113 sum_abs0 += AND_VEC_D(src12);
114 sum_abs1 += AND_VEC_D(src13);
115 sum_abs2 += AND_VEC_D(src14);
116 sum_abs3 += AND_VEC_D(src15);
123 LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
125 sum_abs0 += AND_VEC_D(src0);
126 sum_abs1 += AND_VEC_D(src1);
127 sum_abs2 += AND_VEC_D(src2);
128 sum_abs3 += AND_VEC_D(src3);
129 sum_abs0 += AND_VEC_D(src4);
130 sum_abs1 += AND_VEC_D(src5);
131 sum_abs2 += AND_VEC_D(src6);
132 sum_abs3 += AND_VEC_D(src7);
137 LD_DP4_INC(x, 2, src0, src1, src2, src3);
139 sum_abs0 += AND_VEC_D(src0);
140 sum_abs1 += AND_VEC_D(src1);
141 sum_abs2 += AND_VEC_D(src2);
142 sum_abs3 += AND_VEC_D(src3);
147 LD_DP2_INC(x, 2, src0, src1);
149 sum_abs0 += AND_VEC_D(src0);
150 sum_abs1 += AND_VEC_D(src1);
155 src0 = LD_DP(x); x += 2;
157 sum_abs0 += AND_VEC_D(src0);
166 sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
168 sumf += sum_abs0[0] + sum_abs0[1];
174 LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
175 for (i = (n >> 4) - 1; i--;)
177 LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
179 sum_abs0 += AND_VEC_D(src0);
180 sum_abs1 += AND_VEC_D(src1);
181 sum_abs2 += AND_VEC_D(src2);
182 sum_abs3 += AND_VEC_D(src3);
183 sum_abs0 += AND_VEC_D(src4);
184 sum_abs1 += AND_VEC_D(src5);
185 sum_abs2 += AND_VEC_D(src6);
186 sum_abs3 += AND_VEC_D(src7);
188 LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
190 sum_abs0 += AND_VEC_D(src8);
191 sum_abs1 += AND_VEC_D(src9);
192 sum_abs2 += AND_VEC_D(src10);
193 sum_abs3 += AND_VEC_D(src11);
194 sum_abs0 += AND_VEC_D(src12);
195 sum_abs1 += AND_VEC_D(src13);
196 sum_abs2 += AND_VEC_D(src14);
197 sum_abs3 += AND_VEC_D(src15);
200 LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
202 sum_abs0 += AND_VEC_D(src0);
203 sum_abs1 += AND_VEC_D(src1);
204 sum_abs2 += AND_VEC_D(src2);
205 sum_abs3 += AND_VEC_D(src3);
206 sum_abs0 += AND_VEC_D(src4);
207 sum_abs1 += AND_VEC_D(src5);
208 sum_abs2 += AND_VEC_D(src6);
209 sum_abs3 += AND_VEC_D(src7);
210 sum_abs0 += AND_VEC_D(src8);
211 sum_abs1 += AND_VEC_D(src9);
212 sum_abs2 += AND_VEC_D(src10);
213 sum_abs3 += AND_VEC_D(src11);
214 sum_abs0 += AND_VEC_D(src12);
215 sum_abs1 += AND_VEC_D(src13);
216 sum_abs2 += AND_VEC_D(src14);
217 sum_abs3 += AND_VEC_D(src15);
224 LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
226 sum_abs0 += AND_VEC_D(src0);
227 sum_abs1 += AND_VEC_D(src1);
228 sum_abs2 += AND_VEC_D(src2);
229 sum_abs3 += AND_VEC_D(src3);
230 sum_abs0 += AND_VEC_D(src4);
231 sum_abs1 += AND_VEC_D(src5);
232 sum_abs2 += AND_VEC_D(src6);
233 sum_abs3 += AND_VEC_D(src7);
238 LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
240 sum_abs0 += AND_VEC_D(src0);
241 sum_abs1 += AND_VEC_D(src1);
242 sum_abs2 += AND_VEC_D(src2);
243 sum_abs3 += AND_VEC_D(src3);
248 LD_DP2_INC(x, inc_x, src0, src1);
250 sum_abs0 += AND_VEC_D(src0);
251 sum_abs1 += AND_VEC_D(src1);
258 sum_abs0 += AND_VEC_D(src0);
262 sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;