Optimize cscal function for POWER10
[platform/upstream/openblas.git] / kernel / power / zscal.c
1 /***************************************************************************
2 Copyright (c) 2013-2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
27
28 /**************************************************************************************
29 * 2016/03/27 Werner Saar (wernsaar@googlemail.com)
30 *        BLASTEST               : OK
31 *        CTEST                  : OK
32 *        TEST                   : OK
33 *        LAPACK-TEST            : OK
34 **************************************************************************************/
35
36
37 #include "common.h"
38
39 #pragma GCC optimize "O1"
40
41 #if defined(__VEC__) || defined(__ALTIVEC__)
42 #if defined(POWER8) || defined(POWER9)
43 #if defined(DOUBLE)
44 #include "zscal_microk_power8.c"
45 #endif
46 #elif defined(POWER10)
47 #if defined(DOUBLE)
48 #include "zscal_microk_power8.c"
49 #else
50 #include "cscal_microk_power10.c"
51 #endif
52 #endif
53 #endif
54
55
56 #ifndef HAVE_KERNEL_8
57
58 static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT da_r, FLOAT da_i)
59 {
60
61         BLASLONG i=0;
62         FLOAT *x1=x;
63         FLOAT  alpha_r1=da_r;
64         FLOAT  alpha_r2=da_r;
65         FLOAT  alpha_i1=-da_i;
66         FLOAT  alpha_i2=da_i;
67         FLOAT  temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31;
68         FLOAT  x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i;
69
70         while ( i<n )
71         {
72                 x0_r = x1[0];
73                 x0_i = x1[1];
74                 x1_r = x1[2];
75                 x1_i = x1[3];
76                 x2_r = x1[4];
77                 x2_i = x1[5];
78                 x3_r = x1[6];
79                 x3_i = x1[7];
80
81                 temp00  = x0_r * alpha_r1;
82                 temp10  = x1_r * alpha_r1;
83                 temp20  = x2_r * alpha_r1;
84                 temp30  = x3_r * alpha_r1;
85
86                 temp01  = x0_i * alpha_r2;
87                 temp11  = x1_i * alpha_r2;
88                 temp21  = x2_i * alpha_r2;
89                 temp31  = x3_i * alpha_r2;
90
91                 temp00 += x0_i * alpha_i1;
92                 temp10 += x1_i * alpha_i1;
93                 temp20 += x2_i * alpha_i1;
94                 temp30 += x3_i * alpha_i1;
95
96                 temp01 += x0_r * alpha_i2;
97                 temp11 += x1_r * alpha_i2;
98                 temp21 += x2_r * alpha_i2;
99                 temp31 += x3_r * alpha_i2;
100
101                 x1[0] = temp00;
102                 x1[1] = temp01;
103                 x1[2] = temp10;
104                 x1[3] = temp11;
105                 x1[4] = temp20;
106                 x1[5] = temp21;
107                 x1[6] = temp30;
108                 x1[7] = temp31;
109
110                 x1 += 8;
111                 i+=4;
112
113         }
114         return;
115
116
117 }
118
119 #endif
120
121 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
122 {
123         BLASLONG i=0;
124         BLASLONG inc_x2;
125         BLASLONG ip = 0;
126         FLOAT temp;
127         BLASLONG n1;
128
129         if ( n <= 0 )
130                 return(0);
131
132         if ( inc_x <= 0 )
133                 return(0);
134
135         if (da_r == ZERO && da_i == ZERO) {
136           //clear the vector and return
137           if (inc_x == 1) {
138             memset(x, 0, n*COMPSIZE*SIZE);
139           }else{
140             inc_x2 = 2 * inc_x;
141             for(i=0; i<n; i++){
142               x[ip]=ZERO; 
143               x[ip+1]=ZERO;
144               ip += inc_x2;
145             }
146           }
147           return 0;
148         }
149
150         if ( inc_x == 1 )
151         {
152
153
154 #if defined(DOUBLE)
155                 n1 = n & -8;
156 #else
157                 n1 = n & -16;
158 #endif
159                 if ( n1 > 0 )
160                 {
161                         zscal_kernel_8(n1, x, da_r, da_i);
162                         i=n1;
163                         ip = n1 * 2;
164
165                 }
166
167                 while ( i < n )
168                 {
169
170                                 temp    = da_r * x[ip]   - da_i * x[ip+1] ;
171                                 x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
172                                 x[ip]   = temp;
173                                 ip += 2;
174                                 i++;
175                 }
176
177         }
178         else
179         {
180
181                 inc_x2 = 2 * inc_x;
182
183                 while ( i < n )
184                 {
185
186                                 temp    = da_r * x[ip]   - da_i * x[ip+1] ;
187                                 x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
188                                 x[ip]   = temp;
189                                 ip += inc_x2;
190                                 i++;
191                 }
192
193
194         }
195
196         return(0);
197
198 }
199
200