Merge tag 'powerpc-6.6-6' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc...
[platform/kernel/linux-starfive.git] / arch / x86 / kernel / fpu / xstate.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * xsave/xrstor support.
4  *
5  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6  */
7 #include <linux/bitops.h>
8 #include <linux/compat.h>
9 #include <linux/cpu.h>
10 #include <linux/mman.h>
11 #include <linux/nospec.h>
12 #include <linux/pkeys.h>
13 #include <linux/seq_file.h>
14 #include <linux/proc_fs.h>
15 #include <linux/vmalloc.h>
16
17 #include <asm/fpu/api.h>
18 #include <asm/fpu/regset.h>
19 #include <asm/fpu/signal.h>
20 #include <asm/fpu/xcr.h>
21
22 #include <asm/tlbflush.h>
23 #include <asm/prctl.h>
24 #include <asm/elf.h>
25
26 #include "context.h"
27 #include "internal.h"
28 #include "legacy.h"
29 #include "xstate.h"
30
/*
 * Iterate over the bits set in @mask, starting at FIRST_EXTENDED_XFEATURE,
 * with @bit holding the current bit number.
 *
 * Note: this expands to an assignment followed by for_each_set_bit_from(),
 * i.e. to two statements. It must therefore not be used as the unbraced
 * body of an if/else or of another loop. @mask must be an lvalue because
 * its address is taken.
 */
#define for_each_extended_xfeature(bit, mask)                           \
        (bit) = FIRST_EXTENDED_XFEATURE;                                \
        for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
34
/*
 * Human readable xfeature names, indexed by position in this table.
 * cpu_has_xfeatures() indexes it with the number of the highest bit set
 * in an xfeature mask and clamps that index to the last entry.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
        "x87 floating point registers",                 /*  0 */
        "SSE registers",                                /*  1 */
        "AVX registers",                                /*  2 */
        "MPX bounds registers",                         /*  3 */
        "MPX CSR",                                      /*  4 */
        "AVX-512 opmask",                               /*  5 */
        "AVX-512 Hi256",                                /*  6 */
        "AVX-512 ZMM_Hi256",                            /*  7 */
        "Processor Trace (unused)",                     /*  8 */
        "Protection Keys User registers",               /*  9 */
        "PASID state",                                  /* 10 */
        "Control-flow User registers",                  /* 11 */
        "Control-flow Kernel registers (unused)",       /* 12 */
        "unknown xstate feature",                       /* 13 */
        "unknown xstate feature",                       /* 14 */
        "unknown xstate feature",                       /* 15 */
        "unknown xstate feature",                       /* 16 */
        "AMX Tile config",                              /* 17 */
        "AMX Tile data",                                /* 18 */
        "unknown xstate feature",                       /* 19: also used for anything beyond */
};
63
/*
 * Maps an xfeature number to an X86_FEATURE_* flag.
 *
 * fpu__init_system_xstate() walks this table and removes an xfeature from
 * fpu_kernel_cfg.max_features when boot_cpu_has() does not report the
 * associated flag. Slots which are not listed here are zero and are
 * removed as well, except for XFEATURE_FP.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
        [XFEATURE_FP]                           = X86_FEATURE_FPU,
        [XFEATURE_SSE]                          = X86_FEATURE_XMM,
        [XFEATURE_YMM]                          = X86_FEATURE_AVX,
        [XFEATURE_BNDREGS]                      = X86_FEATURE_MPX,
        [XFEATURE_BNDCSR]                       = X86_FEATURE_MPX,
        [XFEATURE_OPMASK]                       = X86_FEATURE_AVX512F,
        [XFEATURE_ZMM_Hi256]                    = X86_FEATURE_AVX512F,
        [XFEATURE_Hi16_ZMM]                     = X86_FEATURE_AVX512F,
        [XFEATURE_PT_UNIMPLEMENTED_SO_FAR]      = X86_FEATURE_INTEL_PT,
        [XFEATURE_PKRU]                         = X86_FEATURE_OSPKE,
        [XFEATURE_PASID]                        = X86_FEATURE_ENQCMD,
        [XFEATURE_CET_USER]                     = X86_FEATURE_SHSTK,
        [XFEATURE_XTILE_CFG]                    = X86_FEATURE_AMX_TILE,
        [XFEATURE_XTILE_DATA]                   = X86_FEATURE_AMX_TILE,
};
80
/*
 * Per-xfeature layout information, indexed by xfeature number.
 *
 * Offsets and sizes start out as -1 (all bits set, as the arrays are
 * unsigned). setup_xstate_cache() fills in the entries for the legacy
 * features and for the features enabled in fpu_kernel_cfg.max_features;
 * the offset of a supervisor feature is left at -1.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
/* ECX as returned by the XSTATE_CPUID sub-leaf of the xfeature */
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/* Bits evaluated in xstate_flags[] */
#define XSTATE_FLAG_SUPERVISOR  BIT(0)
#define XSTATE_FLAG_ALIGNED64   BIT(1)
89
90 /*
91  * Return whether the system supports a given xfeature.
92  *
93  * Also return the name of the (most advanced) feature that the caller requested:
94  */
95 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
96 {
97         u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
98
99         if (unlikely(feature_name)) {
100                 long xfeature_idx, max_idx;
101                 u64 xfeatures_print;
102                 /*
103                  * So we use FLS here to be able to print the most advanced
104                  * feature that was requested but is missing. So if a driver
105                  * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
106                  * missing AVX feature - this is the most informative message
107                  * to users:
108                  */
109                 if (xfeatures_missing)
110                         xfeatures_print = xfeatures_missing;
111                 else
112                         xfeatures_print = xfeatures_needed;
113
114                 xfeature_idx = fls64(xfeatures_print)-1;
115                 max_idx = ARRAY_SIZE(xfeature_names)-1;
116                 xfeature_idx = min(xfeature_idx, max_idx);
117
118                 *feature_name = xfeature_names[xfeature_idx];
119         }
120
121         if (xfeatures_missing)
122                 return 0;
123
124         return 1;
125 }
126 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
127
128 static bool xfeature_is_aligned64(int xfeature_nr)
129 {
130         return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
131 }
132
133 static bool xfeature_is_supervisor(int xfeature_nr)
134 {
135         return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
136 }
137
138 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
139 {
140         unsigned int offs, i;
141
142         /*
143          * Non-compacted format and legacy features use the cached fixed
144          * offsets.
145          */
146         if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
147             xfeature <= XFEATURE_SSE)
148                 return xstate_offsets[xfeature];
149
150         /*
151          * Compacted format offsets depend on the actual content of the
152          * compacted xsave area which is determined by the xcomp_bv header
153          * field.
154          */
155         offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
156         for_each_extended_xfeature(i, xcomp_bv) {
157                 if (xfeature_is_aligned64(i))
158                         offs = ALIGN(offs, 64);
159                 if (i == xfeature)
160                         break;
161                 offs += xstate_sizes[i];
162         }
163         return offs;
164 }
165
166 /*
167  * Enable the extended processor state save/restore feature.
168  * Called once per CPU onlining.
169  */
170 void fpu__init_cpu_xstate(void)
171 {
172         if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
173                 return;
174
175         cr4_set_bits(X86_CR4_OSXSAVE);
176
177         /*
178          * Must happen after CR4 setup and before xsetbv() to allow KVM
179          * lazy passthrough.  Write independent of the dynamic state static
180          * key as that does not work on the boot CPU. This also ensures
181          * that any stale state is wiped out from XFD.
182          */
183         if (cpu_feature_enabled(X86_FEATURE_XFD))
184                 wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
185
186         /*
187          * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
188          * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
189          * states can be set here.
190          */
191         xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
192
193         /*
194          * MSR_IA32_XSS sets supervisor states managed by XSAVES.
195          */
196         if (boot_cpu_has(X86_FEATURE_XSAVES)) {
197                 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
198                                      xfeatures_mask_independent());
199         }
200 }
201
202 static bool xfeature_enabled(enum xfeature xfeature)
203 {
204         return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
205 }
206
207 /*
208  * Record the offsets and sizes of various xstates contained
209  * in the XSAVE state memory layout.
210  */
211 static void __init setup_xstate_cache(void)
212 {
213         u32 eax, ebx, ecx, edx, i;
214         /* start at the beginning of the "extended state" */
215         unsigned int last_good_offset = offsetof(struct xregs_state,
216                                                  extended_state_area);
217         /*
218          * The FP xstates and SSE xstates are legacy states. They are always
219          * in the fixed offsets in the xsave area in either compacted form
220          * or standard form.
221          */
222         xstate_offsets[XFEATURE_FP]     = 0;
223         xstate_sizes[XFEATURE_FP]       = offsetof(struct fxregs_state,
224                                                    xmm_space);
225
226         xstate_offsets[XFEATURE_SSE]    = xstate_sizes[XFEATURE_FP];
227         xstate_sizes[XFEATURE_SSE]      = sizeof_field(struct fxregs_state,
228                                                        xmm_space);
229
230         for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
231                 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
232
233                 xstate_sizes[i] = eax;
234                 xstate_flags[i] = ecx;
235
236                 /*
237                  * If an xfeature is supervisor state, the offset in EBX is
238                  * invalid, leave it to -1.
239                  */
240                 if (xfeature_is_supervisor(i))
241                         continue;
242
243                 xstate_offsets[i] = ebx;
244
245                 /*
246                  * In our xstate size checks, we assume that the highest-numbered
247                  * xstate feature has the highest offset in the buffer.  Ensure
248                  * it does.
249                  */
250                 WARN_ONCE(last_good_offset > xstate_offsets[i],
251                           "x86/fpu: misordered xstate at %d\n", last_good_offset);
252
253                 last_good_offset = xstate_offsets[i];
254         }
255 }
256
257 static void __init print_xstate_feature(u64 xstate_mask)
258 {
259         const char *feature_name;
260
261         if (cpu_has_xfeatures(xstate_mask, &feature_name))
262                 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
263 }
264
265 /*
266  * Print out all the supported xstate features:
267  */
268 static void __init print_xstate_features(void)
269 {
270         print_xstate_feature(XFEATURE_MASK_FP);
271         print_xstate_feature(XFEATURE_MASK_SSE);
272         print_xstate_feature(XFEATURE_MASK_YMM);
273         print_xstate_feature(XFEATURE_MASK_BNDREGS);
274         print_xstate_feature(XFEATURE_MASK_BNDCSR);
275         print_xstate_feature(XFEATURE_MASK_OPMASK);
276         print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
277         print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
278         print_xstate_feature(XFEATURE_MASK_PKRU);
279         print_xstate_feature(XFEATURE_MASK_PASID);
280         print_xstate_feature(XFEATURE_MASK_CET_USER);
281         print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
282         print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
283 }
284
/*
 * Sanity check an xfeature *number*: warn when @nr is below
 * FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX.
 *
 * This check is important because it is easy to confuse an xfeature
 * number (XFEATURE_*) with an xfeature mask (XFEATURE_MASK_*).
 *
 * @nr is evaluated twice, so it must not have side effects.
 */
#define CHECK_XFEATURE(nr) do {                         \
        WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);        \
        WARN_ON((nr) >= XFEATURE_MAX);                  \
} while (0)
293
294 /*
295  * Print out xstate component offsets and sizes
296  */
297 static void __init print_xstate_offset_size(void)
298 {
299         int i;
300
301         for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
302                 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
303                         i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
304                         i, xstate_sizes[i]);
305         }
306 }
307
/*
 * Restore the FPU registers from @xstate during boot.
 *
 * This function is called only during boot time when x86 caps are not set
 * up and alternative can not be used yet.
 *
 * The requested components are fpu_kernel_cfg.max_features restricted to
 * XFEATURE_MASK_FPSTATE. XRSTORS is used when X86_FEATURE_XSAVES is
 * enabled, XRSTOR otherwise.
 */
static __init void os_xrstor_booting(struct xregs_state *xstate)
{
        u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
        /* The mask is handed to XSTATE_OP() as two 32-bit halves */
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        if (cpu_feature_enabled(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);

        /*
         * We should never fault when copying from a kernel buffer, and the FPU
         * state we set at boot time should be valid.
         */
        WARN_ON_FPU(err);
}
330
/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 *
 * The BUILD_BUG_ON() in setup_init_fpu_buf() compares this list against
 * XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED          \
        (XFEATURE_MASK_FP |                     \
         XFEATURE_MASK_SSE |                    \
         XFEATURE_MASK_YMM |                    \
         XFEATURE_MASK_OPMASK |                 \
         XFEATURE_MASK_ZMM_Hi256 |              \
         XFEATURE_MASK_Hi16_ZMM  |              \
         XFEATURE_MASK_PKRU |                   \
         XFEATURE_MASK_BNDREGS |                \
         XFEATURE_MASK_BNDCSR |                 \
         XFEATURE_MASK_PASID |                  \
         XFEATURE_MASK_CET_USER |               \
         XFEATURE_MASK_XTILE)
351
/*
 * setup the xstate image representing the init state
 *
 * Does nothing beyond the build time check when the CPU lacks XSAVE.
 * Otherwise the supported features are printed, the xcomp_bv field of
 * init_fpstate is set up, the registers are loaded from init_fpstate and
 * the legacy part is read back with FXSAVE.
 */
static void __init setup_init_fpu_buf(void)
{
        /*
         * Break the build when a feature is added to the supported masks
         * without being added to XFEATURES_INIT_FPSTATE_HANDLED.
         */
        BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
                      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
                     XFEATURES_INIT_FPSTATE_HANDLED);

        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return;

        print_xstate_features();

        xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);

        /*
         * Init all the features state with header.xfeatures being 0x0
         */
        os_xrstor_booting(&init_fpstate.regs.xsave);

        /*
         * All components are now in init state. Read the state back so
         * that init_fpstate contains all non-zero init state. This only
         * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
         * those use the init optimization which skips writing data for
         * components in init state.
         *
         * XSAVE could be used, but that would require to reshuffle the
         * data when XSAVEC/S is available because XSAVEC/S uses xstate
         * compaction. But doing so is a pointless exercise because most
         * components have an all zeros init state except for the legacy
         * ones (FP and SSE). Those can be saved with FXSAVE into the
         * legacy area. Adding new features requires to ensure that init
         * state is all zeroes or if not to add the necessary handling
         * here.
         */
        fxsave(&init_fpstate.regs.fxsave);
}
391
392 int xfeature_size(int xfeature_nr)
393 {
394         u32 eax, ebx, ecx, edx;
395
396         CHECK_XFEATURE(xfeature_nr);
397         cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
398         return eax;
399 }
400
401 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
402 static int validate_user_xstate_header(const struct xstate_header *hdr,
403                                        struct fpstate *fpstate)
404 {
405         /* No unknown or supervisor features may be set */
406         if (hdr->xfeatures & ~fpstate->user_xfeatures)
407                 return -EINVAL;
408
409         /* Userspace must use the uncompacted format */
410         if (hdr->xcomp_bv)
411                 return -EINVAL;
412
413         /*
414          * If 'reserved' is shrunken to add a new field, make sure to validate
415          * that new field here!
416          */
417         BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
418
419         /* No reserved bits may be set */
420         if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
421                 return -EINVAL;
422
423         return 0;
424 }
425
426 static void __init __xstate_dump_leaves(void)
427 {
428         int i;
429         u32 eax, ebx, ecx, edx;
430         static int should_dump = 1;
431
432         if (!should_dump)
433                 return;
434         should_dump = 0;
435         /*
436          * Dump out a few leaves past the ones that we support
437          * just in case there are some goodies up there
438          */
439         for (i = 0; i < XFEATURE_MAX + 10; i++) {
440                 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
441                 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
442                         XSTATE_CPUID, i, eax, ebx, ecx, edx);
443         }
444 }
445
/*
 * Warn once with the "XSAVE consistency problem: " prefix when @x is true
 * and dump the xstate CPUID leaves in that case.
 */
#define XSTATE_WARN_ON(x, fmt, ...) do {                                        \
        if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {   \
                __xstate_dump_leaves();                                         \
        }                                                                       \
} while (0)
451
/*
 * Compare the size @sz of xfeature @nr with sizeof(@__struct). A mismatch
 * triggers a one-time warning plus a dump of the xstate CPUID leaves.
 *
 * Note: the expression evaluates to true in either case, so a mismatch
 * is reported but not signalled to the caller.
 */
#define XCHECK_SZ(sz, nr, __struct) ({                                  \
        if (WARN_ONCE(sz != sizeof(__struct),                           \
            "[%s]: struct is %zu bytes, cpu state %d bytes\n",          \
            xfeature_names[nr], sizeof(__struct), sz)) {                \
                __xstate_dump_leaves();                                 \
        }                                                               \
        true;                                                           \
})
460
461
462 /**
463  * check_xtile_data_against_struct - Check tile data state size.
464  *
465  * Calculate the state size by multiplying the single tile size which is
466  * recorded in a C struct, and the number of tiles that the CPU informs.
467  * Compare the provided size with the calculation.
468  *
469  * @size:       The tile data state size
470  *
471  * Returns:     0 on success, -EINVAL on mismatch.
472  */
473 static int __init check_xtile_data_against_struct(int size)
474 {
475         u32 max_palid, palid, state_size;
476         u32 eax, ebx, ecx, edx;
477         u16 max_tile;
478
479         /*
480          * Check the maximum palette id:
481          *   eax: the highest numbered palette subleaf.
482          */
483         cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
484
485         /*
486          * Cross-check each tile size and find the maximum number of
487          * supported tiles.
488          */
489         for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
490                 u16 tile_size, max;
491
492                 /*
493                  * Check the tile size info:
494                  *   eax[31:16]:  bytes per title
495                  *   ebx[31:16]:  the max names (or max number of tiles)
496                  */
497                 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
498                 tile_size = eax >> 16;
499                 max = ebx >> 16;
500
501                 if (tile_size != sizeof(struct xtile_data)) {
502                         pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
503                                __stringify(XFEATURE_XTILE_DATA),
504                                sizeof(struct xtile_data), tile_size);
505                         __xstate_dump_leaves();
506                         return -EINVAL;
507                 }
508
509                 if (max > max_tile)
510                         max_tile = max;
511         }
512
513         state_size = sizeof(struct xtile_data) * max_tile;
514         if (size != state_size) {
515                 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
516                        __stringify(XFEATURE_XTILE_DATA), state_size, size);
517                 __xstate_dump_leaves();
518                 return -EINVAL;
519         }
520         return 0;
521 }
522
523 /*
524  * We have a C struct for each 'xstate'.  We need to ensure
525  * that our software representation matches what the CPU
526  * tells us about the state's size.
527  */
528 static bool __init check_xstate_against_struct(int nr)
529 {
530         /*
531          * Ask the CPU for the size of the state.
532          */
533         int sz = xfeature_size(nr);
534
535         /*
536          * Match each CPU state with the corresponding software
537          * structure.
538          */
539         switch (nr) {
540         case XFEATURE_YMM:        return XCHECK_SZ(sz, nr, struct ymmh_struct);
541         case XFEATURE_BNDREGS:    return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
542         case XFEATURE_BNDCSR:     return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
543         case XFEATURE_OPMASK:     return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
544         case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
545         case XFEATURE_Hi16_ZMM:   return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
546         case XFEATURE_PKRU:       return XCHECK_SZ(sz, nr, struct pkru_state);
547         case XFEATURE_PASID:      return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
548         case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
549         case XFEATURE_CET_USER:   return XCHECK_SZ(sz, nr, struct cet_user_state);
550         case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
551         default:
552                 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
553                 return false;
554         }
555
556         return true;
557 }
558
559 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
560 {
561         unsigned int topmost = fls64(xfeatures) -  1;
562         unsigned int offset = xstate_offsets[topmost];
563
564         if (topmost <= XFEATURE_SSE)
565                 return sizeof(struct xregs_state);
566
567         if (compacted)
568                 offset = xfeature_get_offset(xfeatures, topmost);
569         return offset + xstate_sizes[topmost];
570 }
571
572 /*
573  * This essentially double-checks what the cpu told us about
574  * how large the XSAVE buffer needs to be.  We are recalculating
575  * it to be safe.
576  *
577  * Independent XSAVE features allocate their own buffers and are not
578  * covered by these checks. Only the size of the buffer for task->fpu
579  * is checked here.
580  */
581 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
582 {
583         bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
584         bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
585         unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
586         int i;
587
588         for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
589                 if (!check_xstate_against_struct(i))
590                         return false;
591                 /*
592                  * Supervisor state components can be managed only by
593                  * XSAVES.
594                  */
595                 if (!xsaves && xfeature_is_supervisor(i)) {
596                         XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
597                         return false;
598                 }
599         }
600         size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
601         XSTATE_WARN_ON(size != kernel_size,
602                        "size %u != kernel_size %u\n", size, kernel_size);
603         return size == kernel_size;
604 }
605
606 /*
607  * Get total size of enabled xstates in XCR0 | IA32_XSS.
608  *
609  * Note the SDM's wording here.  "sub-function 0" only enumerates
610  * the size of the *user* states.  If we use it to size a buffer
611  * that we use 'XSAVES' on, we could potentially overflow the
612  * buffer because 'XSAVES' saves system states too.
613  *
614  * This also takes compaction into account. So this works for
615  * XSAVEC as well.
616  */
617 static unsigned int __init get_compacted_size(void)
618 {
619         unsigned int eax, ebx, ecx, edx;
620         /*
621          * - CPUID function 0DH, sub-function 1:
622          *    EBX enumerates the size (in bytes) required by
623          *    the XSAVES instruction for an XSAVE area
624          *    containing all the state components
625          *    corresponding to bits currently set in
626          *    XCR0 | IA32_XSS.
627          *
628          * When XSAVES is not available but XSAVEC is (virt), then there
629          * are no supervisor states, but XSAVEC still uses compacted
630          * format.
631          */
632         cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
633         return ebx;
634 }
635
636 /*
637  * Get the total size of the enabled xstates without the independent supervisor
638  * features.
639  */
640 static unsigned int __init get_xsave_compacted_size(void)
641 {
642         u64 mask = xfeatures_mask_independent();
643         unsigned int size;
644
645         if (!mask)
646                 return get_compacted_size();
647
648         /* Disable independent features. */
649         wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
650
651         /*
652          * Ask the hardware what size is required of the buffer.
653          * This is the size required for the task->fpu buffer.
654          */
655         size = get_compacted_size();
656
657         /* Re-enable independent features so XSAVES will work on them again. */
658         wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
659
660         return size;
661 }
662
663 static unsigned int __init get_xsave_size_user(void)
664 {
665         unsigned int eax, ebx, ecx, edx;
666         /*
667          * - CPUID function 0DH, sub-function 0:
668          *    EBX enumerates the size (in bytes) required by
669          *    the XSAVE instruction for an XSAVE area
670          *    containing all the *user* state components
671          *    corresponding to bits currently set in XCR0.
672          */
673         cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
674         return ebx;
675 }
676
677 static int __init init_xstate_size(void)
678 {
679         /* Recompute the context size for enabled features: */
680         unsigned int user_size, kernel_size, kernel_default_size;
681         bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
682
683         /* Uncompacted user space size */
684         user_size = get_xsave_size_user();
685
686         /*
687          * XSAVES kernel size includes supervisor states and uses compacted
688          * format. XSAVEC uses compacted format, but does not save
689          * supervisor states.
690          *
691          * XSAVE[OPT] do not support supervisor states so kernel and user
692          * size is identical.
693          */
694         if (compacted)
695                 kernel_size = get_xsave_compacted_size();
696         else
697                 kernel_size = user_size;
698
699         kernel_default_size =
700                 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
701
702         if (!paranoid_xstate_size_valid(kernel_size))
703                 return -EINVAL;
704
705         fpu_kernel_cfg.max_size = kernel_size;
706         fpu_user_cfg.max_size = user_size;
707
708         fpu_kernel_cfg.default_size = kernel_default_size;
709         fpu_user_cfg.default_size =
710                 xstate_calculate_size(fpu_user_cfg.default_features, false);
711
712         return 0;
713 }
714
715 /*
716  * We enabled the XSAVE hardware, but something went wrong and
717  * we can not use it.  Disable it.
718  */
719 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
720 {
721         fpu_kernel_cfg.max_features = 0;
722         cr4_clear_bits(X86_CR4_OSXSAVE);
723         setup_clear_cpu_cap(X86_FEATURE_XSAVE);
724
725         /* Restore the legacy size.*/
726         fpu_kernel_cfg.max_size = legacy_size;
727         fpu_kernel_cfg.default_size = legacy_size;
728         fpu_user_cfg.max_size = legacy_size;
729         fpu_user_cfg.default_size = legacy_size;
730
731         /*
732          * Prevent enabling the static branch which enables writes to the
733          * XFD MSR.
734          */
735         init_fpstate.xfd = 0;
736
737         fpstate_reset(&current->thread.fpu);
738 }
739
740 /*
741  * Enable and initialize the xsave feature.
742  * Called once per system bootup.
743  */
744 void __init fpu__init_system_xstate(unsigned int legacy_size)
745 {
746         unsigned int eax, ebx, ecx, edx;
747         u64 xfeatures;
748         int err;
749         int i;
750
751         if (!boot_cpu_has(X86_FEATURE_FPU)) {
752                 pr_info("x86/fpu: No FPU detected\n");
753                 return;
754         }
755
756         if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
757                 pr_info("x86/fpu: x87 FPU will use %s\n",
758                         boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
759                 return;
760         }
761
762         if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
763                 WARN_ON_FPU(1);
764                 return;
765         }
766
767         /*
768          * Find user xstates supported by the processor.
769          */
770         cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
771         fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
772
773         /*
774          * Find supervisor xstates supported by the processor.
775          */
776         cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
777         fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
778
779         if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
780                 /*
781                  * This indicates that something really unexpected happened
782                  * with the enumeration.  Disable XSAVE and try to continue
783                  * booting without it.  This is too early to BUG().
784                  */
785                 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
786                        fpu_kernel_cfg.max_features);
787                 goto out_disable;
788         }
789
790         /*
791          * Clear XSAVE features that are disabled in the normal CPUID.
792          */
793         for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
794                 unsigned short cid = xsave_cpuid_features[i];
795
796                 /* Careful: X86_FEATURE_FPU is 0! */
797                 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
798                         fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
799         }
800
801         if (!cpu_feature_enabled(X86_FEATURE_XFD))
802                 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
803
804         if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
805                 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
806         else
807                 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
808                                         XFEATURE_MASK_SUPERVISOR_SUPPORTED;
809
810         fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
811         fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
812
813         /* Clean out dynamic features from default */
814         fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
815         fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
816
817         fpu_user_cfg.default_features = fpu_user_cfg.max_features;
818         fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
819
820         /* Store it for paranoia check at the end */
821         xfeatures = fpu_kernel_cfg.max_features;
822
823         /*
824          * Initialize the default XFD state in initfp_state and enable the
825          * dynamic sizing mechanism if dynamic states are available.  The
826          * static key cannot be enabled here because this runs before
827          * jump_label_init(). This is delayed to an initcall.
828          */
829         init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
830
831         /* Set up compaction feature bit */
832         if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
833             cpu_feature_enabled(X86_FEATURE_XSAVES))
834                 setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
835
836         /* Enable xstate instructions to be able to continue with initialization: */
837         fpu__init_cpu_xstate();
838
839         /* Cache size, offset and flags for initialization */
840         setup_xstate_cache();
841
842         err = init_xstate_size();
843         if (err)
844                 goto out_disable;
845
846         /* Reset the state for the current task */
847         fpstate_reset(&current->thread.fpu);
848
849         /*
850          * Update info used for ptrace frames; use standard-format size and no
851          * supervisor xstates:
852          */
853         update_regset_xstate_info(fpu_user_cfg.max_size,
854                                   fpu_user_cfg.max_features);
855
856         /*
857          * init_fpstate excludes dynamic states as they are large but init
858          * state is zero.
859          */
860         init_fpstate.size               = fpu_kernel_cfg.default_size;
861         init_fpstate.xfeatures          = fpu_kernel_cfg.default_features;
862
863         if (init_fpstate.size > sizeof(init_fpstate.regs)) {
864                 pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
865                         sizeof(init_fpstate.regs), init_fpstate.size);
866                 goto out_disable;
867         }
868
869         setup_init_fpu_buf();
870
871         /*
872          * Paranoia check whether something in the setup modified the
873          * xfeatures mask.
874          */
875         if (xfeatures != fpu_kernel_cfg.max_features) {
876                 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
877                        xfeatures, fpu_kernel_cfg.max_features);
878                 goto out_disable;
879         }
880
881         /*
882          * CPU capabilities initialization runs before FPU init. So
883          * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
884          * functional, set the feature bit so depending code works.
885          */
886         setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
887
888         print_xstate_offset_size();
889         pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
890                 fpu_kernel_cfg.max_features,
891                 fpu_kernel_cfg.max_size,
892                 boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
893         return;
894
895 out_disable:
896         /* something went wrong, try to boot without any XSAVE support */
897         fpu__init_disable_system_xstate(legacy_size);
898 }
899
900 /*
901  * Restore minimal FPU state after suspend:
902  */
903 void fpu__resume_cpu(void)
904 {
905         /*
906          * Restore XCR0 on xsave capable CPUs:
907          */
908         if (cpu_feature_enabled(X86_FEATURE_XSAVE))
909                 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
910
911         /*
912          * Restore IA32_XSS. The same CPUID bit enumerates support
913          * of XSAVES and MSR_IA32_XSS.
914          */
915         if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
916                 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
917                                      xfeatures_mask_independent());
918         }
919
920         if (fpu_state_size_dynamic())
921                 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
922 }
923
924 /*
925  * Given an xstate feature nr, calculate where in the xsave
926  * buffer the state is.  Callers should ensure that the buffer
927  * is valid.
928  */
929 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
930 {
931         u64 xcomp_bv = xsave->header.xcomp_bv;
932
933         if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
934                 return NULL;
935
936         if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
937                 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
938                         return NULL;
939         }
940
941         return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
942 }
943
944 /*
945  * Given the xsave area and a state inside, this function returns the
946  * address of the state.
947  *
948  * This is the API that is called to get xstate address in either
949  * standard format or compacted format of xsave area.
950  *
951  * Note that if there is no data for the field in the xsave buffer
952  * this will return NULL.
953  *
954  * Inputs:
955  *      xstate: the thread's storage area for all FPU data
956  *      xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
957  *      XFEATURE_SSE, etc...)
958  * Output:
959  *      address of the state in the xsave area, or NULL if the
960  *      field is not present in the xsave buffer.
961  */
962 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
963 {
964         /*
965          * Do we even *have* xsave state?
966          */
967         if (!boot_cpu_has(X86_FEATURE_XSAVE))
968                 return NULL;
969
970         /*
971          * We should not ever be requesting features that we
972          * have not enabled.
973          */
974         if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
975                 return NULL;
976
977         /*
978          * This assumes the last 'xsave*' instruction to
979          * have requested that 'xfeature_nr' be saved.
980          * If it did not, we might be seeing and old value
981          * of the field in the buffer.
982          *
983          * This can happen because the last 'xsave' did not
984          * request that this feature be saved (unlikely)
985          * or because the "init optimization" caused it
986          * to not be saved.
987          */
988         if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
989                 return NULL;
990
991         return __raw_xsave_addr(xsave, xfeature_nr);
992 }
993
#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * Modify the PKRU register so that the access rights for @pkey match
 * @init_val.
 *
 * @tsk:	not referenced by this implementation
 * @pkey:	protection key whose rights are changed
 * @init_val:	PKEY_DISABLE_ACCESS and/or PKEY_DISABLE_WRITE; other
 *		bits are ignored
 *
 * Returns 0 on success, -EINVAL when OSPKE is not enabled or when
 * @pkey is outside of [0, arch_max_pkey()).
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
			      unsigned long init_val)
{
	u32 old_pkru, new_pkru_bits = 0;
	int pkey_shift;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users.  Complain
	 * if a bad value is observed.
	 *
	 * @pkey is signed, so the lower bound has to be checked as
	 * well: a negative key would turn into a negative shift count
	 * below, which is undefined behaviour.
	 */
	if (WARN_ON_ONCE(pkey < 0 || pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in PKRU:  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	new_pkru_bits <<= pkey_shift;

	/* Get old PKRU and mask off any old bits in place: */
	old_pkru = read_pkru();
	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */
1041
1042 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1043                          void *init_xstate, unsigned int size)
1044 {
1045         membuf_write(to, from_xstate ? xstate : init_xstate, size);
1046 }
1047
1048 /**
1049  * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1050  * @to:         membuf descriptor
1051  * @fpstate:    The fpstate buffer from which to copy
1052  * @xfeatures:  The mask of xfeatures to save (XSAVE mode only)
1053  * @pkru_val:   The PKRU value to store in the PKRU component
1054  * @copy_mode:  The requested copy mode
1055  *
1056  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1057  * format, i.e. from the kernel internal hardware dependent storage format
1058  * to the requested @mode. UABI XSTATE is always uncompacted!
1059  *
1060  * It supports partial copy but @to.pos always starts from zero.
1061  */
1062 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1063                                u64 xfeatures, u32 pkru_val,
1064                                enum xstate_copy_mode copy_mode)
1065 {
1066         const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1067         struct xregs_state *xinit = &init_fpstate.regs.xsave;
1068         struct xregs_state *xsave = &fpstate->regs.xsave;
1069         struct xstate_header header;
1070         unsigned int zerofrom;
1071         u64 mask;
1072         int i;
1073
1074         memset(&header, 0, sizeof(header));
1075         header.xfeatures = xsave->header.xfeatures;
1076
1077         /* Mask out the feature bits depending on copy mode */
1078         switch (copy_mode) {
1079         case XSTATE_COPY_FP:
1080                 header.xfeatures &= XFEATURE_MASK_FP;
1081                 break;
1082
1083         case XSTATE_COPY_FX:
1084                 header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1085                 break;
1086
1087         case XSTATE_COPY_XSAVE:
1088                 header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1089                 break;
1090         }
1091
1092         /* Copy FP state up to MXCSR */
1093         copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1094                      &xinit->i387, off_mxcsr);
1095
1096         /* Copy MXCSR when SSE or YMM are set in the feature mask */
1097         copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1098                      &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1099                      MXCSR_AND_FLAGS_SIZE);
1100
1101         /* Copy the remaining FP state */
1102         copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1103                      &to, &xsave->i387.st_space, &xinit->i387.st_space,
1104                      sizeof(xsave->i387.st_space));
1105
1106         /* Copy the SSE state - shared with YMM, but independently managed */
1107         copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1108                      &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1109                      sizeof(xsave->i387.xmm_space));
1110
1111         if (copy_mode != XSTATE_COPY_XSAVE)
1112                 goto out;
1113
1114         /* Zero the padding area */
1115         membuf_zero(&to, sizeof(xsave->i387.padding));
1116
1117         /* Copy xsave->i387.sw_reserved */
1118         membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1119
1120         /* Copy the user space relevant state of @xsave->header */
1121         membuf_write(&to, &header, sizeof(header));
1122
1123         zerofrom = offsetof(struct xregs_state, extended_state_area);
1124
1125         /*
1126          * This 'mask' indicates which states to copy from fpstate.
1127          * Those extended states that are not present in fpstate are
1128          * either disabled or initialized:
1129          *
1130          * In non-compacted format, disabled features still occupy
1131          * state space but there is no state to copy from in the
1132          * compacted init_fpstate. The gap tracking will zero these
1133          * states.
1134          *
1135          * The extended features have an all zeroes init state. Thus,
1136          * remove them from 'mask' to zero those features in the user
1137          * buffer instead of retrieving them from init_fpstate.
1138          */
1139         mask = header.xfeatures;
1140
1141         for_each_extended_xfeature(i, mask) {
1142                 /*
1143                  * If there was a feature or alignment gap, zero the space
1144                  * in the destination buffer.
1145                  */
1146                 if (zerofrom < xstate_offsets[i])
1147                         membuf_zero(&to, xstate_offsets[i] - zerofrom);
1148
1149                 if (i == XFEATURE_PKRU) {
1150                         struct pkru_state pkru = {0};
1151                         /*
1152                          * PKRU is not necessarily up to date in the
1153                          * XSAVE buffer. Use the provided value.
1154                          */
1155                         pkru.pkru = pkru_val;
1156                         membuf_write(&to, &pkru, sizeof(pkru));
1157                 } else {
1158                         membuf_write(&to,
1159                                      __raw_xsave_addr(xsave, i),
1160                                      xstate_sizes[i]);
1161                 }
1162                 /*
1163                  * Keep track of the last copied state in the non-compacted
1164                  * target buffer for gap zeroing.
1165                  */
1166                 zerofrom = xstate_offsets[i] + xstate_sizes[i];
1167         }
1168
1169 out:
1170         if (to.left)
1171                 membuf_zero(&to, to.left);
1172 }
1173
1174 /**
1175  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1176  * @to:         membuf descriptor
1177  * @tsk:        The task from which to copy the saved xstate
1178  * @copy_mode:  The requested copy mode
1179  *
1180  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1181  * format, i.e. from the kernel internal hardware dependent storage format
1182  * to the requested @mode. UABI XSTATE is always uncompacted!
1183  *
1184  * It supports partial copy but @to.pos always starts from zero.
1185  */
1186 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1187                              enum xstate_copy_mode copy_mode)
1188 {
1189         __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1190                                   tsk->thread.fpu.fpstate->user_xfeatures,
1191                                   tsk->thread.pkru, copy_mode);
1192 }
1193
1194 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1195                             const void *kbuf, const void __user *ubuf)
1196 {
1197         if (kbuf) {
1198                 memcpy(dst, kbuf + offset, size);
1199         } else {
1200                 if (copy_from_user(dst, ubuf + offset, size))
1201                         return -EFAULT;
1202         }
1203         return 0;
1204 }
1205
1206
1207 /**
1208  * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1209  * @fpstate:    The fpstate buffer to copy to
1210  * @kbuf:       The UABI format buffer, if it comes from the kernel
1211  * @ubuf:       The UABI format buffer, if it comes from userspace
1212  * @pkru:       The location to write the PKRU value to
1213  *
1214  * Converts from the UABI format into the kernel internal hardware
1215  * dependent format.
1216  *
1217  * This function ultimately has three different callers with distinct PKRU
1218  * behavior.
1219  * 1.   When called from sigreturn the PKRU register will be restored from
1220  *      @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1221  *      @fpstate is sufficient to cover this case, but the caller will also
1222  *      pass a pointer to the thread_struct's pkru field in @pkru and updating
1223  *      it is harmless.
1224  * 2.   When called from ptrace the PKRU register will be restored from the
1225  *      thread_struct's pkru field. A pointer to that is passed in @pkru.
1226  *      The kernel will restore it manually, so the XRSTOR behavior that resets
1227  *      the PKRU register to the hardware init value (0) if the corresponding
1228  *      xfeatures bit is not set is emulated here.
1229  * 3.   When called from KVM the PKRU register will be restored from the vcpu's
1230  *      pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1231  *      XRSTOR and hasn't had the PKRU resetting behavior described above. To
1232  *      preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1233  *      bit is not set.
1234  */
1235 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1236                                const void __user *ubuf, u32 *pkru)
1237 {
1238         struct xregs_state *xsave = &fpstate->regs.xsave;
1239         unsigned int offset, size;
1240         struct xstate_header hdr;
1241         u64 mask;
1242         int i;
1243
1244         offset = offsetof(struct xregs_state, header);
1245         if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1246                 return -EFAULT;
1247
1248         if (validate_user_xstate_header(&hdr, fpstate))
1249                 return -EINVAL;
1250
1251         /* Validate MXCSR when any of the related features is in use */
1252         mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1253         if (hdr.xfeatures & mask) {
1254                 u32 mxcsr[2];
1255
1256                 offset = offsetof(struct fxregs_state, mxcsr);
1257                 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1258                         return -EFAULT;
1259
1260                 /* Reserved bits in MXCSR must be zero. */
1261                 if (mxcsr[0] & ~mxcsr_feature_mask)
1262                         return -EINVAL;
1263
1264                 /* SSE and YMM require MXCSR even when FP is not in use. */
1265                 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1266                         xsave->i387.mxcsr = mxcsr[0];
1267                         xsave->i387.mxcsr_mask = mxcsr[1];
1268                 }
1269         }
1270
1271         for (i = 0; i < XFEATURE_MAX; i++) {
1272                 mask = BIT_ULL(i);
1273
1274                 if (hdr.xfeatures & mask) {
1275                         void *dst = __raw_xsave_addr(xsave, i);
1276
1277                         offset = xstate_offsets[i];
1278                         size = xstate_sizes[i];
1279
1280                         if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1281                                 return -EFAULT;
1282                 }
1283         }
1284
1285         if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1286                 struct pkru_state *xpkru;
1287
1288                 xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1289                 *pkru = xpkru->pkru;
1290         } else {
1291                 /*
1292                  * KVM may pass NULL here to indicate that it does not need
1293                  * PKRU updated.
1294                  */
1295                 if (pkru)
1296                         *pkru = 0;
1297         }
1298
1299         /*
1300          * The state that came in from userspace was user-state only.
1301          * Mask all the user states out of 'xfeatures':
1302          */
1303         xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1304
1305         /*
1306          * Add back in the features that came in from userspace:
1307          */
1308         xsave->header.xfeatures |= hdr.xfeatures;
1309
1310         return 0;
1311 }
1312
1313 /*
1314  * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1315  * format and copy to the target thread. Used by ptrace and KVM.
1316  */
1317 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1318 {
1319         return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1320 }
1321
1322 /*
1323  * Convert from a sigreturn standard-format user-space buffer to kernel
1324  * XSAVE[S] format and copy to the target thread. This is called from the
1325  * sigreturn() and rt_sigreturn() system calls.
1326  */
1327 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1328                                       const void __user *ubuf)
1329 {
1330         return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1331 }
1332
1333 static bool validate_independent_components(u64 mask)
1334 {
1335         u64 xchk;
1336
1337         if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1338                 return false;
1339
1340         xchk = ~xfeatures_mask_independent();
1341
1342         if (WARN_ON_ONCE(!mask || mask & xchk))
1343                 return false;
1344
1345         return true;
1346 }
1347
1348 /**
1349  * xsaves - Save selected components to a kernel xstate buffer
1350  * @xstate:     Pointer to the buffer
1351  * @mask:       Feature mask to select the components to save
1352  *
1353  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1354  * XSAVES does not write the full xstate header. Before first use the
1355  * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1356  * can #GP.
1357  *
1358  * The feature mask must be a subset of the independent features.
1359  */
1360 void xsaves(struct xregs_state *xstate, u64 mask)
1361 {
1362         int err;
1363
1364         if (!validate_independent_components(mask))
1365                 return;
1366
1367         XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1368         WARN_ON_ONCE(err);
1369 }
1370
1371 /**
1372  * xrstors - Restore selected components from a kernel xstate buffer
1373  * @xstate:     Pointer to the buffer
1374  * @mask:       Feature mask to select the components to restore
1375  *
1376  * The @xstate buffer must be 64 byte aligned and correctly initialized
1377  * otherwise XRSTORS from that buffer can #GP.
1378  *
1379  * Proper usage is to restore the state which was saved with
1380  * xsaves() into @xstate.
1381  *
1382  * The feature mask must be a subset of the independent features.
1383  */
1384 void xrstors(struct xregs_state *xstate, u64 mask)
1385 {
1386         int err;
1387
1388         if (!validate_independent_components(mask))
1389                 return;
1390
1391         XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1392         WARN_ON_ONCE(err);
1393 }
1394
1395 #if IS_ENABLED(CONFIG_KVM)
1396 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1397 {
1398         void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1399
1400         if (addr)
1401                 memset(addr, 0, xstate_sizes[xfeature]);
1402 }
1403 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1404 #endif
1405
1406 #ifdef CONFIG_X86_64
1407
1408 #ifdef CONFIG_X86_DEBUG_FPU
1409 /*
1410  * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1411  * can safely operate on the @fpstate buffer.
1412  */
1413 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1414 {
1415         u64 xfd = __this_cpu_read(xfd_state);
1416
1417         if (fpstate->xfd == xfd)
1418                 return true;
1419
1420          /*
1421           * The XFD MSR does not match fpstate->xfd. That's invalid when
1422           * the passed in fpstate is current's fpstate.
1423           */
1424         if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1425                 return false;
1426
1427         /*
1428          * XRSTOR(S) from init_fpstate are always correct as it will just
1429          * bring all components into init state and not read from the
1430          * buffer. XSAVE(S) raises #PF after init.
1431          */
1432         if (fpstate == &init_fpstate)
1433                 return rstor;
1434
1435         /*
1436          * XSAVE(S): clone(), fpu_swap_kvm_fpu()
1437          * XRSTORS(S): fpu_swap_kvm_fpu()
1438          */
1439
1440         /*
1441          * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1442          * the buffer area for XFD-disabled state components.
1443          */
1444         mask &= ~xfd;
1445
1446         /*
1447          * Remove features which are valid in fpstate. They
1448          * have space allocated in fpstate.
1449          */
1450         mask &= ~fpstate->xfeatures;
1451
1452         /*
1453          * Any remaining state components in 'mask' might be written
1454          * by XSAVE/XRSTOR. Fail validation it found.
1455          */
1456         return !mask;
1457 }
1458
1459 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1460 {
1461         WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1462 }
1463 #endif /* CONFIG_X86_DEBUG_FPU */
1464
1465 static int __init xfd_update_static_branch(void)
1466 {
1467         /*
1468          * If init_fpstate.xfd has bits set then dynamic features are
1469          * available and the dynamic sizing must be enabled.
1470          */
1471         if (init_fpstate.xfd)
1472                 static_branch_enable(&__fpu_state_size_dynamic);
1473         return 0;
1474 }
1475 arch_initcall(xfd_update_static_branch)
1476
1477 void fpstate_free(struct fpu *fpu)
1478 {
1479         if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1480                 vfree(fpu->fpstate);
1481 }
1482
1483 /**
1484  * fpstate_realloc - Reallocate struct fpstate for the requested new features
1485  *
1486  * @xfeatures:  A bitmap of xstate features which extend the enabled features
1487  *              of that task
1488  * @ksize:      The required size for the kernel buffer
1489  * @usize:      The required size for user space buffers
1490  * @guest_fpu:  Pointer to a guest FPU container. NULL for host allocations
1491  *
1492  * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1493  * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1494  * with large states are likely to live longer.
1495  *
1496  * Returns: 0 on success, -ENOMEM on allocation error.
1497  */
1498 static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1499                            unsigned int usize, struct fpu_guest *guest_fpu)
1500 {
1501         struct fpu *fpu = &current->thread.fpu;
1502         struct fpstate *curfps, *newfps = NULL;
1503         unsigned int fpsize;
1504         bool in_use;
1505
1506         fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1507
1508         newfps = vzalloc(fpsize);
1509         if (!newfps)
1510                 return -ENOMEM;
1511         newfps->size = ksize;
1512         newfps->user_size = usize;
1513         newfps->is_valloc = true;
1514
1515         /*
1516          * When a guest FPU is supplied, use @guest_fpu->fpstate
1517          * as reference independent whether it is in use or not.
1518          */
1519         curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1520
1521         /* Determine whether @curfps is the active fpstate */
1522         in_use = fpu->fpstate == curfps;
1523
1524         if (guest_fpu) {
1525                 newfps->is_guest = true;
1526                 newfps->is_confidential = curfps->is_confidential;
1527                 newfps->in_use = curfps->in_use;
1528                 guest_fpu->xfeatures |= xfeatures;
1529                 guest_fpu->uabi_size = usize;
1530         }
1531
1532         fpregs_lock();
1533         /*
1534          * If @curfps is in use, ensure that the current state is in the
1535          * registers before swapping fpstate as that might invalidate it
1536          * due to layout changes.
1537          */
1538         if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1539                 fpregs_restore_userregs();
1540
1541         newfps->xfeatures = curfps->xfeatures | xfeatures;
1542         newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1543         newfps->xfd = curfps->xfd & ~xfeatures;
1544
1545         /* Do the final updates within the locked region */
1546         xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1547
1548         if (guest_fpu) {
1549                 guest_fpu->fpstate = newfps;
1550                 /* If curfps is active, update the FPU fpstate pointer */
1551                 if (in_use)
1552                         fpu->fpstate = newfps;
1553         } else {
1554                 fpu->fpstate = newfps;
1555         }
1556
1557         if (in_use)
1558                 xfd_update_state(fpu->fpstate);
1559         fpregs_unlock();
1560
1561         /* Only free valloc'ed state */
1562         if (curfps && curfps->is_valloc)
1563                 vfree(curfps);
1564
1565         return 0;
1566 }
1567
1568 static int validate_sigaltstack(unsigned int usize)
1569 {
1570         struct task_struct *thread, *leader = current->group_leader;
1571         unsigned long framesize = get_sigframe_size();
1572
1573         lockdep_assert_held(&current->sighand->siglock);
1574
1575         /* get_sigframe_size() is based on fpu_user_cfg.max_size */
1576         framesize -= fpu_user_cfg.max_size;
1577         framesize += usize;
1578         for_each_thread(leader, thread) {
1579                 if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1580                         return -ENOSPC;
1581         }
1582         return 0;
1583 }
1584
1585 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1586 {
1587         /*
1588          * This deliberately does not exclude !XSAVES as we still might
1589          * decide to optionally context switch XCR0 or talk the silicon
1590          * vendors into extending XFD for the pre AMX states, especially
1591          * AVX512.
1592          */
1593         bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1594         struct fpu *fpu = &current->group_leader->thread.fpu;
1595         struct fpu_state_perm *perm;
1596         unsigned int ksize, usize;
1597         u64 mask;
1598         int ret = 0;
1599
1600         /* Check whether fully enabled */
1601         if ((permitted & requested) == requested)
1602                 return 0;
1603
1604         /* Calculate the resulting kernel state size */
1605         mask = permitted | requested;
1606         /* Take supervisor states into account on the host */
1607         if (!guest)
1608                 mask |= xfeatures_mask_supervisor();
1609         ksize = xstate_calculate_size(mask, compacted);
1610
1611         /* Calculate the resulting user state size */
1612         mask &= XFEATURE_MASK_USER_SUPPORTED;
1613         usize = xstate_calculate_size(mask, false);
1614
1615         if (!guest) {
1616                 ret = validate_sigaltstack(usize);
1617                 if (ret)
1618                         return ret;
1619         }
1620
1621         perm = guest ? &fpu->guest_perm : &fpu->perm;
1622         /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1623         WRITE_ONCE(perm->__state_perm, mask);
1624         /* Protected by sighand lock */
1625         perm->__state_size = ksize;
1626         perm->__user_state_size = usize;
1627         return ret;
1628 }
1629
1630 /*
1631  * Permissions array to map facilities with more than one component
1632  */
1633 static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1634         [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1635 };
1636
1637 static int xstate_request_perm(unsigned long idx, bool guest)
1638 {
1639         u64 permitted, requested;
1640         int ret;
1641
1642         if (idx >= XFEATURE_MAX)
1643                 return -EINVAL;
1644
1645         /*
1646          * Look up the facility mask which can require more than
1647          * one xstate component.
1648          */
1649         idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1650         requested = xstate_prctl_req[idx];
1651         if (!requested)
1652                 return -EOPNOTSUPP;
1653
1654         if ((fpu_user_cfg.max_features & requested) != requested)
1655                 return -EOPNOTSUPP;
1656
1657         /* Lockless quick check */
1658         permitted = xstate_get_group_perm(guest);
1659         if ((permitted & requested) == requested)
1660                 return 0;
1661
1662         /* Protect against concurrent modifications */
1663         spin_lock_irq(&current->sighand->siglock);
1664         permitted = xstate_get_group_perm(guest);
1665
1666         /* First vCPU allocation locks the permissions. */
1667         if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1668                 ret = -EBUSY;
1669         else
1670                 ret = __xstate_request_perm(permitted, requested, guest);
1671         spin_unlock_irq(&current->sighand->siglock);
1672         return ret;
1673 }
1674
1675 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1676 {
1677         u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1678         struct fpu_state_perm *perm;
1679         unsigned int ksize, usize;
1680         struct fpu *fpu;
1681
1682         if (!xfd_event) {
1683                 if (!guest_fpu)
1684                         pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1685                 return 0;
1686         }
1687
1688         /* Protect against concurrent modifications */
1689         spin_lock_irq(&current->sighand->siglock);
1690
1691         /* If not permitted let it die */
1692         if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1693                 spin_unlock_irq(&current->sighand->siglock);
1694                 return -EPERM;
1695         }
1696
1697         fpu = &current->group_leader->thread.fpu;
1698         perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1699         ksize = perm->__state_size;
1700         usize = perm->__user_state_size;
1701
1702         /*
1703          * The feature is permitted. State size is sufficient.  Dropping
1704          * the lock is safe here even if more features are added from
1705          * another task, the retrieved buffer sizes are valid for the
1706          * currently requested feature(s).
1707          */
1708         spin_unlock_irq(&current->sighand->siglock);
1709
1710         /*
1711          * Try to allocate a new fpstate. If that fails there is no way
1712          * out.
1713          */
1714         if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1715                 return -EFAULT;
1716         return 0;
1717 }
1718
1719 int xfd_enable_feature(u64 xfd_err)
1720 {
1721         return __xfd_enable_feature(xfd_err, NULL);
1722 }
1723
1724 #else /* CONFIG_X86_64 */
1725 static inline int xstate_request_perm(unsigned long idx, bool guest)
1726 {
1727         return -EPERM;
1728 }
1729 #endif  /* !CONFIG_X86_64 */
1730
1731 u64 xstate_get_guest_group_perm(void)
1732 {
1733         return xstate_get_group_perm(true);
1734 }
1735 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1736
1737 /**
1738  * fpu_xstate_prctl - xstate permission operations
1739  * @tsk:        Redundant pointer to current
1740  * @option:     A subfunction of arch_prctl()
1741  * @arg2:       option argument
1742  * Return:      0 if successful; otherwise, an error code
1743  *
1744  * Option arguments:
1745  *
1746  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1747  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1748  * ARCH_REQ_XCOMP_PERM: Facility number requested
1749  *
1750  * For facilities which require more than one XSTATE component, the request
1751  * must be the highest state component number related to that facility,
1752  * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1753  * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1754  */
1755 long fpu_xstate_prctl(int option, unsigned long arg2)
1756 {
1757         u64 __user *uptr = (u64 __user *)arg2;
1758         u64 permitted, supported;
1759         unsigned long idx = arg2;
1760         bool guest = false;
1761
1762         switch (option) {
1763         case ARCH_GET_XCOMP_SUPP:
1764                 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1765                 return put_user(supported, uptr);
1766
1767         case ARCH_GET_XCOMP_PERM:
1768                 /*
1769                  * Lockless snapshot as it can also change right after the
1770                  * dropping the lock.
1771                  */
1772                 permitted = xstate_get_host_group_perm();
1773                 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1774                 return put_user(permitted, uptr);
1775
1776         case ARCH_GET_XCOMP_GUEST_PERM:
1777                 permitted = xstate_get_guest_group_perm();
1778                 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1779                 return put_user(permitted, uptr);
1780
1781         case ARCH_REQ_XCOMP_GUEST_PERM:
1782                 guest = true;
1783                 fallthrough;
1784
1785         case ARCH_REQ_XCOMP_PERM:
1786                 if (!IS_ENABLED(CONFIG_X86_64))
1787                         return -EOPNOTSUPP;
1788
1789                 return xstate_request_perm(idx, guest);
1790
1791         default:
1792                 return -EINVAL;
1793         }
1794 }
1795
1796 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1797 /*
1798  * Report the amount of time elapsed in millisecond since last AVX512
1799  * use in the task.
1800  */
1801 static void avx512_status(struct seq_file *m, struct task_struct *task)
1802 {
1803         unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1804         long delta;
1805
1806         if (!timestamp) {
1807                 /*
1808                  * Report -1 if no AVX512 usage
1809                  */
1810                 delta = -1;
1811         } else {
1812                 delta = (long)(jiffies - timestamp);
1813                 /*
1814                  * Cap to LONG_MAX if time difference > LONG_MAX
1815                  */
1816                 if (delta < 0)
1817                         delta = LONG_MAX;
1818                 delta = jiffies_to_msecs(delta);
1819         }
1820
1821         seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1822         seq_putc(m, '\n');
1823 }
1824
1825 /*
1826  * Report architecture specific information
1827  */
1828 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1829                         struct pid *pid, struct task_struct *task)
1830 {
1831         /*
1832          * Report AVX512 state if the processor and build option supported.
1833          */
1834         if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1835                 avx512_status(m, task);
1836
1837         return 0;
1838 }
1839 #endif /* CONFIG_PROC_PID_ARCH_STATUS */