1 /***************************************************************************
2 Copyright (c) 2013-2020, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
/* Number of bytes by which the prologue lowers SP and the epilogue raises it
   back (see the two addi SP, SP, +/-STACKSIZE instructions). */
34 #define STACKSIZE (512 )
/* Offset, relative to the lowered SP, of the doubleword in which r0 is stored
   on entry and from which it is reloaded on exit. Because SP has already been
   dropped by STACKSIZE, this addresses 16 bytes above the SP value seen on
   entry. NOTE(review): presumably the ABI link-register save slot - confirm. */
35 #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
/* VSX register aliases for the two permute masks that the setup code fills
   with mtvsrdd from the perm_const and save_permute constants. */
50 #define save_permute_1 vs59
51 #define permute_mask vs63
/* Macro definitions for this kernel. NOTE(review): contents are not visible
   from this file; presumably the macros used by cgemm_logic_power10.S. */
77 #include "cgemm_macros_power10.S"
/* 64-bit halves of the two permute masks. The setup code materialises each
   one in a general-purpose register (perm_const1 -> T1, perm_const2 -> T2,
   save_permute_12 -> T3, save_permute_11 -> T4) and then pairs them with
   mtvsrdd: permute_mask receives T2,T1 and save_permute_1 receives T3,T4.
   Every byte of a constant is a byte index; the places where the masks are
   consumed are in the included files. */
79 .equ perm_const1, 0x0405060700010203
80 .equ perm_const2, 0x0c0d0e0f08090a0b
81 .equ save_permute_12, 0x0c0d0e0f1c1d1e1f
82 .equ save_permute_11, 0x0405060714151617
/* Prologue: open a STACKSIZE-byte frame by lowering SP. */
92 addi SP, SP, -STACKSIZE
/* Store r0 in the FLINK_SAVE slot so the epilogue can reload it.
   NOTE(review): the instruction that copies the link register into r0 is not
   visible in this excerpt - confirm r0 holds LR at this point. */
152 std r0, FLINK_SAVE(SP)
/* Load LDC and OFFSET from memory. The "+ STACKSIZE" term cancels the SP
   adjustment made above, so FRAMESLOT(n) is taken relative to the SP value
   on entry (FRAMESLOT itself is defined elsewhere). */
156 ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
161 ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
/* Scale LDC by 2^ZBASE_SHIFT (presumably elements -> bytes; ZBASE_SHIFT is
   defined elsewhere). NOTE(review): slwi is a 32-bit shift, so the scaled
   value is limited to 32 bits - confirm this range is sufficient. */
163 slwi LDC, LDC, ZBASE_SHIFT
167 /* alpha arrives as two double-precision scalars, in vs1 (-> alpha_r) and vs2 (-> alpha_i). Convert each to single precision, then replicate word 0 of the result into all four word lanes of the register. */
168 xscvdpspn alpha_r,vs1
169 xscvdpspn alpha_i,vs2
170 xxspltw alpha_r,alpha_r,0
171 xxspltw alpha_i,alpha_i,0
/* Build the two permute masks.
   Each 64-bit half is assembled in a general-purpose register (T1..T4) from
   its .equ constant by a five-instruction chain; the four chains are
   interleaved one step at a time:
     lis    loads the @highest 16-bit piece (shifted left by 16),
     ori    ORs in the @higher piece,
     rldicr rotates left by 32 and keeps bits 0..31, moving the pair into
            the upper word and clearing the lower word,
     oris   ORs in the @h piece,
     ori    ORs in the @l piece.
   Afterwards T1 = perm_const1, T2 = perm_const2, T3 = save_permute_12 and
   T4 = save_permute_11. */
172 /*load reverse permute mask for big endian
173 uint128 = 0xc0d0e0f08090a0b0405060700010203
176 lis T2, perm_const2@highest
177 lis T1, perm_const1@highest
178 lis T3, save_permute_12@highest
179 lis T4, save_permute_11@highest
182 ori T2, T2, perm_const2@higher
183 ori T1, T1, perm_const1@higher
184 ori T3, T3, save_permute_12@higher
185 ori T4, T4, save_permute_11@higher
188 rldicr T2, T2, 32, 31
189 rldicr T1, T1, 32, 31
190 rldicr T3, T3, 32, 31
191 rldicr T4, T4, 32, 31
193 oris T2, T2, perm_const2@h
194 oris T1, T1, perm_const1@h
195 oris T3, T3, save_permute_12@h
196 oris T4, T4, save_permute_11@h
199 ori T2, T2, perm_const2@l
200 ori T1, T1, perm_const1@l
201 ori T3, T3, save_permute_12@l
202 ori T4, T4, save_permute_11@l
208 #if defined(CC) || defined(CR) || defined(RC) || defined(RR)
209 /* CC/CR/RC/RR variants: flip the sign of every lane of the splatted alpha registers here, because the result is later formed by addition, i.e. -1*(a+b) */
210 xvnegsp alpha_r,alpha_r
211 xvnegsp alpha_i,alpha_i
/* Pair the GPR halves into vector registers: permute_mask is formed from
   T2 and T1 (perm_const2, perm_const1), save_permute_1 from T3 and T4
   (save_permute_12, save_permute_11). */
214 mtvsrdd permute_mask,T2,T1
215 mtvsrdd save_permute_1,T3,T4
217 /* The mask just built is the reverse permute; xxpermdi with selector 2 and identical sources swaps its two doublewords to give the inner-permute form. */
218 xxpermdi permute_mask, permute_mask, permute_mask,2
/* Main body of the kernel is textually included here. NOTE(review): its
   contents are not visible from this file. */
220 #include "cgemm_logic_power10.S"
/* Epilogue: reload r0 from the slot written in the prologue, then release the
   STACKSIZE-byte frame. NOTE(review): the instructions that move r0 back to
   the link register and return are not visible in this excerpt - confirm. */
265 ld r0, FLINK_SAVE(SP)
281 addi SP, SP, STACKSIZE