lib/raid6/loongarch_simd.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
 *
 * Copyright 2023 WANG Xuerui <git@xen0n.name>
 *
 * Based on the generic RAID-6 code (int.uc):
 *
 * Copyright 2002-2004 H. Peter Anvin
 */

#include <linux/raid/pq.h>
#include "loongarch.h"

/*
 * The vector algorithms are currently priority 0, which means the generic
 * scalar algorithms are not disabled when vector support is present.
 * This mirrors the similar LoongArch RAID5 XOR code, and the main reason
 * is repeated here: it cannot be ruled out at this point in time that some
 * future (maybe reduced) models could run the vector algorithms slower than
 * the scalar ones, perhaps due to errata or micro-op reasons. It may be
 * appropriate to revisit this after one or two more uarch generations.
 */

#ifdef CONFIG_CPU_HAS_LSX
#define NSIZE 16

static int raid6_has_lsx(void)
{
        return cpu_has_lsx;
}

static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        /*
         * $vr0, $vr1, $vr2, $vr3: wp
         * $vr4, $vr5, $vr6, $vr7: wq
         * $vr8, $vr9, $vr10, $vr11: wd
         * $vr12, $vr13, $vr14, $vr15: w2
         * $vr16, $vr17, $vr18, $vr19: w1
         */
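        /*
         * Each inner iteration folds one data disk into both parities:
         * P accumulates a plain XOR of the data, while Q is first
         * multiplied by x in GF(2^8) (polynomial 0x11d) and then XORed
         * with the data.  The multiply-by-x is done bytewise: vslti.b
         * builds an all-ones mask for bytes whose top bit is set,
         * vslli.b shifts every byte left by one, vandi.b turns the mask
         * into the reduction constant 0x1d, and the final vxor.v folds
         * it back in.
         */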
        for (d = 0; d < bytes; d += NSIZE*4) {
                /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
                asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
                asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
                asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
                asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
                asm volatile("vori.b $vr4, $vr0, 0");
                asm volatile("vori.b $vr5, $vr1, 0");
                asm volatile("vori.b $vr6, $vr2, 0");
                asm volatile("vori.b $vr7, $vr3, 0");
                for (z = z0-1; z >= 0; z--) {
                        /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
                        asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
                        asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
                        asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
                        asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
                        /* wp$$ ^= wd$$; */
                        asm volatile("vxor.v $vr0, $vr0, $vr8");
                        asm volatile("vxor.v $vr1, $vr1, $vr9");
                        asm volatile("vxor.v $vr2, $vr2, $vr10");
                        asm volatile("vxor.v $vr3, $vr3, $vr11");
                        /* w2$$ = MASK(wq$$); */
                        asm volatile("vslti.b $vr12, $vr4, 0");
                        asm volatile("vslti.b $vr13, $vr5, 0");
                        asm volatile("vslti.b $vr14, $vr6, 0");
                        asm volatile("vslti.b $vr15, $vr7, 0");
                        /* w1$$ = SHLBYTE(wq$$); */
                        asm volatile("vslli.b $vr16, $vr4, 1");
                        asm volatile("vslli.b $vr17, $vr5, 1");
                        asm volatile("vslli.b $vr18, $vr6, 1");
                        asm volatile("vslli.b $vr19, $vr7, 1");
                        /* w2$$ &= NBYTES(0x1d); */
                        asm volatile("vandi.b $vr12, $vr12, 0x1d");
                        asm volatile("vandi.b $vr13, $vr13, 0x1d");
                        asm volatile("vandi.b $vr14, $vr14, 0x1d");
                        asm volatile("vandi.b $vr15, $vr15, 0x1d");
                        /* w1$$ ^= w2$$; */
                        asm volatile("vxor.v $vr16, $vr16, $vr12");
                        asm volatile("vxor.v $vr17, $vr17, $vr13");
                        asm volatile("vxor.v $vr18, $vr18, $vr14");
                        asm volatile("vxor.v $vr19, $vr19, $vr15");
                        /* wq$$ = w1$$ ^ wd$$; */
                        asm volatile("vxor.v $vr4, $vr16, $vr8");
                        asm volatile("vxor.v $vr5, $vr17, $vr9");
                        asm volatile("vxor.v $vr6, $vr18, $vr10");
                        asm volatile("vxor.v $vr7, $vr19, $vr11");
                }
                /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
                asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
                asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
                asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
                asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
                /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
                asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
                asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
                asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
                asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
        }

        kernel_fpu_end();
}

static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
                                   size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        /*
         * $vr0, $vr1, $vr2, $vr3: wp
         * $vr4, $vr5, $vr6, $vr7: wq
         * $vr8, $vr9, $vr10, $vr11: wd
         * $vr12, $vr13, $vr14, $vr15: w2
         * $vr16, $vr17, $vr18, $vr19: w1
         */
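        /*
         * Only the data disks in [start, stop] carry new contents.  The
         * first inner loop ("right side") folds those disks into wp/wq
         * exactly as in gen_syndrome.  The second inner loop ("left
         * side") reads no data at all: it only keeps multiplying wq by
         * x in GF(2^8) once per remaining disk position, so each disk's
         * contribution carries the same power-of-x coefficient it has
         * in the stored syndrome before the two are XORed together.
         */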
        for (d = 0; d < bytes; d += NSIZE*4) {
                /* P/Q data pages */
                /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
                asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
                asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
                asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
                asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
                asm volatile("vori.b $vr4, $vr0, 0");
                asm volatile("vori.b $vr5, $vr1, 0");
                asm volatile("vori.b $vr6, $vr2, 0");
                asm volatile("vori.b $vr7, $vr3, 0");
                for (z = z0-1; z >= start; z--) {
                        /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
                        asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
                        asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
                        asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
                        asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
                        /* wp$$ ^= wd$$; */
                        asm volatile("vxor.v $vr0, $vr0, $vr8");
                        asm volatile("vxor.v $vr1, $vr1, $vr9");
                        asm volatile("vxor.v $vr2, $vr2, $vr10");
                        asm volatile("vxor.v $vr3, $vr3, $vr11");
                        /* w2$$ = MASK(wq$$); */
                        asm volatile("vslti.b $vr12, $vr4, 0");
                        asm volatile("vslti.b $vr13, $vr5, 0");
                        asm volatile("vslti.b $vr14, $vr6, 0");
                        asm volatile("vslti.b $vr15, $vr7, 0");
                        /* w1$$ = SHLBYTE(wq$$); */
                        asm volatile("vslli.b $vr16, $vr4, 1");
                        asm volatile("vslli.b $vr17, $vr5, 1");
                        asm volatile("vslli.b $vr18, $vr6, 1");
                        asm volatile("vslli.b $vr19, $vr7, 1");
                        /* w2$$ &= NBYTES(0x1d); */
                        asm volatile("vandi.b $vr12, $vr12, 0x1d");
                        asm volatile("vandi.b $vr13, $vr13, 0x1d");
                        asm volatile("vandi.b $vr14, $vr14, 0x1d");
                        asm volatile("vandi.b $vr15, $vr15, 0x1d");
                        /* w1$$ ^= w2$$; */
                        asm volatile("vxor.v $vr16, $vr16, $vr12");
                        asm volatile("vxor.v $vr17, $vr17, $vr13");
                        asm volatile("vxor.v $vr18, $vr18, $vr14");
                        asm volatile("vxor.v $vr19, $vr19, $vr15");
                        /* wq$$ = w1$$ ^ wd$$; */
                        asm volatile("vxor.v $vr4, $vr16, $vr8");
                        asm volatile("vxor.v $vr5, $vr17, $vr9");
                        asm volatile("vxor.v $vr6, $vr18, $vr10");
                        asm volatile("vxor.v $vr7, $vr19, $vr11");
                }

                /* P/Q left side optimization */
                for (z = start-1; z >= 0; z--) {
                        /* w2$$ = MASK(wq$$); */
                        asm volatile("vslti.b $vr12, $vr4, 0");
                        asm volatile("vslti.b $vr13, $vr5, 0");
                        asm volatile("vslti.b $vr14, $vr6, 0");
                        asm volatile("vslti.b $vr15, $vr7, 0");
                        /* w1$$ = SHLBYTE(wq$$); */
                        asm volatile("vslli.b $vr16, $vr4, 1");
                        asm volatile("vslli.b $vr17, $vr5, 1");
                        asm volatile("vslli.b $vr18, $vr6, 1");
                        asm volatile("vslli.b $vr19, $vr7, 1");
                        /* w2$$ &= NBYTES(0x1d); */
                        asm volatile("vandi.b $vr12, $vr12, 0x1d");
                        asm volatile("vandi.b $vr13, $vr13, 0x1d");
                        asm volatile("vandi.b $vr14, $vr14, 0x1d");
                        asm volatile("vandi.b $vr15, $vr15, 0x1d");
                        /* wq$$ = w1$$ ^ w2$$; */
                        asm volatile("vxor.v $vr4, $vr16, $vr12");
                        asm volatile("vxor.v $vr5, $vr17, $vr13");
                        asm volatile("vxor.v $vr6, $vr18, $vr14");
                        asm volatile("vxor.v $vr7, $vr19, $vr15");
                }
                /*
                 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
                 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
                 */
                asm volatile(
                        "vld $vr20, %0\n\t"
                        "vld $vr21, %1\n\t"
                        "vld $vr22, %2\n\t"
                        "vld $vr23, %3\n\t"
                        "vld $vr24, %4\n\t"
                        "vld $vr25, %5\n\t"
                        "vld $vr26, %6\n\t"
                        "vld $vr27, %7\n\t"
                        "vxor.v $vr20, $vr20, $vr0\n\t"
                        "vxor.v $vr21, $vr21, $vr1\n\t"
                        "vxor.v $vr22, $vr22, $vr2\n\t"
                        "vxor.v $vr23, $vr23, $vr3\n\t"
                        "vxor.v $vr24, $vr24, $vr4\n\t"
                        "vxor.v $vr25, $vr25, $vr5\n\t"
                        "vxor.v $vr26, $vr26, $vr6\n\t"
                        "vxor.v $vr27, $vr27, $vr7\n\t"
                        "vst $vr20, %0\n\t"
                        "vst $vr21, %1\n\t"
                        "vst $vr22, %2\n\t"
                        "vst $vr23, %3\n\t"
                        "vst $vr24, %4\n\t"
                        "vst $vr25, %5\n\t"
                        "vst $vr26, %6\n\t"
                        "vst $vr27, %7\n\t"
                        : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
                          "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
                          "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
                          "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
                );
        }

        kernel_fpu_end();
}

const struct raid6_calls raid6_lsx = {
        raid6_lsx_gen_syndrome,
        raid6_lsx_xor_syndrome,
        raid6_has_lsx,
        "lsx",
        .priority = 0 /* see the comment near the top of the file for reason */
};

#undef NSIZE
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
#define NSIZE 32

static int raid6_has_lasx(void)
{
        return cpu_has_lasx;
}

static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        /*
         * $xr0, $xr1: wp
         * $xr2, $xr3: wq
         * $xr4, $xr5: wd
         * $xr6, $xr7: w2
         * $xr8, $xr9: w1
         */
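        /*
         * Same algorithm as the LSX routine above, just with 256-bit
         * LASX vectors: two 32-byte lanes per iteration instead of four
         * 16-byte ones, still 64 bytes per pass.
         */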
        for (d = 0; d < bytes; d += NSIZE*2) {
                /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
                asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
                asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
                asm volatile("xvori.b $xr2, $xr0, 0");
                asm volatile("xvori.b $xr3, $xr1, 0");
                for (z = z0-1; z >= 0; z--) {
                        /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
                        asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
                        asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
                        /* wp$$ ^= wd$$; */
                        asm volatile("xvxor.v $xr0, $xr0, $xr4");
                        asm volatile("xvxor.v $xr1, $xr1, $xr5");
                        /* w2$$ = MASK(wq$$); */
                        asm volatile("xvslti.b $xr6, $xr2, 0");
                        asm volatile("xvslti.b $xr7, $xr3, 0");
                        /* w1$$ = SHLBYTE(wq$$); */
                        asm volatile("xvslli.b $xr8, $xr2, 1");
                        asm volatile("xvslli.b $xr9, $xr3, 1");
                        /* w2$$ &= NBYTES(0x1d); */
                        asm volatile("xvandi.b $xr6, $xr6, 0x1d");
                        asm volatile("xvandi.b $xr7, $xr7, 0x1d");
                        /* w1$$ ^= w2$$; */
                        asm volatile("xvxor.v $xr8, $xr8, $xr6");
                        asm volatile("xvxor.v $xr9, $xr9, $xr7");
                        /* wq$$ = w1$$ ^ wd$$; */
                        asm volatile("xvxor.v $xr2, $xr8, $xr4");
                        asm volatile("xvxor.v $xr3, $xr9, $xr5");
                }
                /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
                asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
                asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
                /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
                asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
                asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
        }

        kernel_fpu_end();
}

static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
                                    size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        /*
         * $xr0, $xr1: wp
         * $xr2, $xr3: wq
         * $xr4, $xr5: wd
         * $xr6, $xr7: w2
         * $xr8, $xr9: w1
         */
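        /*
         * LASX variant of the LSX xor_syndrome above: the same
         * right-side/left-side split, processing two 32-byte vectors
         * per iteration.
         */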
        for (d = 0; d < bytes; d += NSIZE*2) {
                /* P/Q data pages */
                /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
                asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
                asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
                asm volatile("xvori.b $xr2, $xr0, 0");
                asm volatile("xvori.b $xr3, $xr1, 0");
                for (z = z0-1; z >= start; z--) {
                        /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
                        asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
                        asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
                        /* wp$$ ^= wd$$; */
                        asm volatile("xvxor.v $xr0, $xr0, $xr4");
                        asm volatile("xvxor.v $xr1, $xr1, $xr5");
                        /* w2$$ = MASK(wq$$); */
                        asm volatile("xvslti.b $xr6, $xr2, 0");
                        asm volatile("xvslti.b $xr7, $xr3, 0");
                        /* w1$$ = SHLBYTE(wq$$); */
                        asm volatile("xvslli.b $xr8, $xr2, 1");
                        asm volatile("xvslli.b $xr9, $xr3, 1");
                        /* w2$$ &= NBYTES(0x1d); */
                        asm volatile("xvandi.b $xr6, $xr6, 0x1d");
                        asm volatile("xvandi.b $xr7, $xr7, 0x1d");
                        /* w1$$ ^= w2$$; */
                        asm volatile("xvxor.v $xr8, $xr8, $xr6");
                        asm volatile("xvxor.v $xr9, $xr9, $xr7");
                        /* wq$$ = w1$$ ^ wd$$; */
                        asm volatile("xvxor.v $xr2, $xr8, $xr4");
                        asm volatile("xvxor.v $xr3, $xr9, $xr5");
                }

                /* P/Q left side optimization */
                for (z = start-1; z >= 0; z--) {
                        /* w2$$ = MASK(wq$$); */
                        asm volatile("xvslti.b $xr6, $xr2, 0");
                        asm volatile("xvslti.b $xr7, $xr3, 0");
                        /* w1$$ = SHLBYTE(wq$$); */
                        asm volatile("xvslli.b $xr8, $xr2, 1");
                        asm volatile("xvslli.b $xr9, $xr3, 1");
                        /* w2$$ &= NBYTES(0x1d); */
                        asm volatile("xvandi.b $xr6, $xr6, 0x1d");
                        asm volatile("xvandi.b $xr7, $xr7, 0x1d");
                        /* wq$$ = w1$$ ^ w2$$; */
                        asm volatile("xvxor.v $xr2, $xr8, $xr6");
                        asm volatile("xvxor.v $xr3, $xr9, $xr7");
                }
                /*
                 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
                 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
                 */
                asm volatile(
                        "xvld $xr10, %0\n\t"
                        "xvld $xr11, %1\n\t"
                        "xvld $xr12, %2\n\t"
                        "xvld $xr13, %3\n\t"
                        "xvxor.v $xr10, $xr10, $xr0\n\t"
                        "xvxor.v $xr11, $xr11, $xr1\n\t"
                        "xvxor.v $xr12, $xr12, $xr2\n\t"
                        "xvxor.v $xr13, $xr13, $xr3\n\t"
                        "xvst $xr10, %0\n\t"
                        "xvst $xr11, %1\n\t"
                        "xvst $xr12, %2\n\t"
                        "xvst $xr13, %3\n\t"
                        : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
                          "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
                );
        }

        kernel_fpu_end();
}

const struct raid6_calls raid6_lasx = {
        raid6_lasx_gen_syndrome,
        raid6_lasx_xor_syndrome,
        raid6_has_lasx,
        "lasx",
        .priority = 0 /* see the comment near the top of the file for reason */
};
#undef NSIZE
#endif /* CONFIG_CPU_HAS_LASX */