arch/x86/include/asm/xor_avx.h

   1 #ifndef _ASM_X86_XOR_AVX_H
   2 #define _ASM_X86_XOR_AVX_H
   3
   4 /*
   5  * Optimized RAID-5 checksumming functions for AVX
   6  *
   7  * Copyright (C) 2012 Intel Corporation
   8  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
   9  *
  10  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
  11  *
  12  * This program is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU General Public License
  14  * as published by the Free Software Foundation; version 2
  15  * of the License.
  16  */
  17
  18 #ifdef CONFIG_AS_AVX
  19
  20 #include <linux/compiler.h>
  21 #include <asm/i387.h>
  22
     /* Shorthand for 32-byte alignment; applied to the ymm_save buffers below. */
  23 #define ALIGN32 __aligned(32)
  24
     /*
      * Number of YMM registers spilled by YMMS_SAVE (ymm0-ymm3); each caller
      * sizes its ymm_save buffer as 32 * YMM_SAVED_REGS bytes.
      */
  25 #define YMM_SAVED_REGS 4
  26
     /*
      * YMMS_SAVE - enter the AVX section and spill ymm0-ymm3.
      *
      * Must be expanded inside a function that declares the locals `cr0` and
      * `ymm_save`. It disables preemption, stores the result of read_cr0() in
      * `cr0`, calls clts(), and then writes ymm0..ymm3 to ymm_save at byte
      * offsets 0, 32, 64 and 96. The aligned vmovaps form is used; the callers
      * in this file declare ymm_save with ALIGN32.
      *
      * Pair every use with YMMS_RESTORE, which undoes these steps.
      *
      * NOTE(review): the expansion already ends in ';' (after "while (0)"), and
      * the call sites in this file invoke the macro without a semicolon.
      */
  27 #define YMMS_SAVE \
  28 do { \
  29         preempt_disable(); \
  30         cr0 = read_cr0(); \
  31         clts(); \
  32         asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
  33         asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
  34         asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
  35         asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
  36 } while (0);
  37
     /*
      * YMMS_RESTORE - leave the AVX section opened by YMMS_SAVE.
      *
      * Issues an sfence, reloads ymm3..ymm0 from the caller's `ymm_save`
      * buffer (reverse of the order in which YMMS_SAVE stored them), passes
      * the saved `cr0` value to write_cr0(), and re-enables preemption.
      *
      * NOTE(review): as with YMMS_SAVE, the expansion already ends in ';' and
      * the call sites in this file invoke the macro without a semicolon.
      */
  38 #define YMMS_RESTORE \
  39 do { \
  40         asm volatile("sfence" : : : "memory"); \
  41         asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
  42         asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
  43         asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
  44         asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
  45         write_cr0(cr0); \
  46         preempt_enable(); \
  47 } while (0);
  48
     /*
      * BLOCK4(i) - expand BLOCK() four times, for the 32-byte units i, i + 1,
      * i + 2 and i + 3 (byte offsets 32 * i ... 32 * (i + 3)), using YMM
      * registers 0, 1, 2 and 3 respectively.
      *
      * BLOCK(offset, reg) is not defined here; each xor_avx_N() function
      * (re)defines it immediately before use. The argument is parenthesized so
      * that a non-literal expression would still expand with the intended
      * precedence.
      */
  49 #define BLOCK4(i) \
  50                 BLOCK(32 * (i), 0) \
  51                 BLOCK(32 * ((i) + 1), 1) \
  52                 BLOCK(32 * ((i) + 2), 2) \
  53                 BLOCK(32 * ((i) + 3), 3)
  54
     /*
      * BLOCK16() - expand BLOCK4() for units 0, 4, 8 and 12, i.e. sixteen
      * BLOCK() invocations covering byte offsets 0 through 480 in steps of 32.
      * Together they span the 512 bytes by which the xor_avx_N() loops advance
      * their pointers on each iteration.
      */
  55 #define BLOCK16() \
  56                 BLOCK4(0) \
  57                 BLOCK4(4) \
  58                 BLOCK4(8) \
  59                 BLOCK4(12)
  60
     /*
      * xor_avx_2 - XOR the buffer at p1 into the buffer at p0 using AVX.
      * @bytes: byte count; handled in 512-byte chunks (bytes >> 9), so any
      *         remainder smaller than 512 bytes is not processed
      * @p0:    destination, also the first source operand
      * @p1:    second source operand
      *
      * The aligned vmovdqa form is used for every load and store, so both
      * buffers are presumably 32-byte aligned -- TODO confirm against callers.
      */
  61 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
  62 {
     /* cr0 and ymm_save are consumed by YMMS_SAVE / YMMS_RESTORE. */
  63         unsigned long cr0, lines = bytes >> 9;
  64         char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
  65
  66         YMMS_SAVE
  67
  68         while (lines--) {
     /*
      * BLOCK(i, reg): load the 32 bytes at byte offset i of p1 into %ymm<reg>,
      * XOR in the 32 bytes at the same offset of p0, and store the result back
      * to p0. The value travels between the three asm statements in the named
      * YMM register, which is not declared as an operand or clobber, so the
      * statements must stay in this order.
      */
  69 #undef BLOCK
  70 #define BLOCK(i, reg) \
  71 do { \
  72         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
  73         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
  74                 "m" (p0[i / sizeof(*p0)])); \
  75         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  76                 "=m" (p0[i / sizeof(*p0)])); \
  77 } while (0);
  78
  79                 BLOCK16()
  80
     /* Step both buffers past the 512 bytes just processed. */
  81                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
  82                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
  83         }
  84
  85         YMMS_RESTORE
  86 }
  87
     /*
      * xor_avx_3 - XOR the buffers at p1 and p2 into the buffer at p0 using AVX.
      * @bytes: byte count; handled in 512-byte chunks (bytes >> 9), so any
      *         remainder smaller than 512 bytes is not processed
      * @p0:    destination, also a source operand
      * @p1:    source operand
      * @p2:    source operand
      *
      * The aligned vmovdqa form is used for every load and store, so the
      * buffers are presumably 32-byte aligned -- TODO confirm against callers.
      */
  88 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  89         unsigned long *p2)
  90 {
     /* cr0 and ymm_save are consumed by YMMS_SAVE / YMMS_RESTORE. */
  91         unsigned long cr0, lines = bytes >> 9;
  92         char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
  93
  94         YMMS_SAVE
  95
  96         while (lines--) {
     /*
      * BLOCK(i, reg): load 32 bytes at byte offset i of p2 into %ymm<reg>, XOR
      * in the matching 32 bytes of p1 and then p0, and store the result to p0.
      * The asm statements communicate through the named YMM register, which is
      * not declared as an operand or clobber, so they must stay in this order.
      */
  97 #undef BLOCK
  98 #define BLOCK(i, reg) \
  99 do { \
 100         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
 101         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 102                 "m" (p1[i / sizeof(*p1)])); \
 103         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 104                 "m" (p0[i / sizeof(*p0)])); \
 105         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 106                 "=m" (p0[i / sizeof(*p0)])); \
 107 } while (0);
 108
 109                 BLOCK16()
 110
     /* Step all buffers past the 512 bytes just processed. */
 111                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 112                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 113                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 114         }
 115
 116         YMMS_RESTORE
 117 }
 118
 119 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 120         unsigned long *p2, unsigned long *p3)
 121 {
 122         unsigned long cr0, lines = bytes >> 9;
 123         char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
 124
 125         YMMS_SAVE
 126
 127         while (lines--) {
 128 #undef BLOCK
 129 #define BLOCK(i, reg) \
 130 do { \
 131         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
 132         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 133                 "m" (p2[i / sizeof(*p2)])); \
 134         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 135                 "m" (p1[i / sizeof(*p1)])); \
 136         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 137                 "m" (p0[i / sizeof(*p0)])); \
 138         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 139                 "=m" (p0[i / sizeof(*p0)])); \
 140 } while (0);
 141
 142                 BLOCK16();
 143
 144                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 145                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 146                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 147                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 148         }
 149
 150         YMMS_RESTORE
 151 }
 152
     /*
      * xor_avx_5 - XOR the buffers at p1, p2, p3 and p4 into the buffer at p0
      * using AVX.
      * @bytes: byte count; handled in 512-byte chunks (bytes >> 9), so any
      *         remainder smaller than 512 bytes is not processed
      * @p0:    destination, also a source operand
      * @p1:    source operand
      * @p2:    source operand
      * @p3:    source operand
      * @p4:    source operand
      *
      * The aligned vmovdqa form is used for every load and store, so the
      * buffers are presumably 32-byte aligned -- TODO confirm against callers.
      */
 153 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 154         unsigned long *p2, unsigned long *p3, unsigned long *p4)
 155 {
     /* cr0 and ymm_save are consumed by YMMS_SAVE / YMMS_RESTORE. */
 156         unsigned long cr0, lines = bytes >> 9;
 157         char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
 158
 159         YMMS_SAVE
 160
 161         while (lines--) {
     /*
      * BLOCK(i, reg): load 32 bytes at byte offset i of p4 into %ymm<reg>, XOR
      * in the matching 32 bytes of p3, p2, p1 and then p0, and store the
      * result to p0. The asm statements communicate through the named YMM
      * register, which is not declared as an operand or clobber, so they must
      * stay in this order.
      */
 162 #undef BLOCK
 163 #define BLOCK(i, reg) \
 164 do { \
 165         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
 166         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 167                 "m" (p3[i / sizeof(*p3)])); \
 168         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 169                 "m" (p2[i / sizeof(*p2)])); \
 170         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 171                 "m" (p1[i / sizeof(*p1)])); \
 172         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 173                 "m" (p0[i / sizeof(*p0)])); \
 174         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 175                 "=m" (p0[i / sizeof(*p0)])); \
 176 } while (0);
 177
 178                 BLOCK16()
 179
     /* Step all buffers past the 512 bytes just processed. */
 180                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 181                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 182                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 183                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 184                 p4 = (unsigned long *)((uintptr_t)p4 + 512);
 185         }
 186
 187         YMMS_RESTORE
 188 }
 189
     /*
      * XOR template named "avx" that binds the 2- to 5-buffer entry points to
      * the xor_avx_N() routines above. struct xor_block_template is declared
      * outside this header.
      */
 190 static struct xor_block_template xor_block_avx = {
 191         .name = "avx",
 192         .do_2 = xor_avx_2,
 193         .do_3 = xor_avx_3,
 194         .do_4 = xor_avx_4,
 195         .do_5 = xor_avx_5,
 196 };
 197
     /*
      * AVX_XOR_SPEED - when cpu_has_avx is true, pass the AVX template to
      * xor_speed(); otherwise do nothing. xor_speed() is defined outside this
      * header -- presumably it benchmarks the template; confirm there.
      * The expansion has no trailing ';', so the caller supplies it.
      */
 198 #define AVX_XOR_SPEED \
 199 do { \
 200         if (cpu_has_avx) \
 201                 xor_speed(&xor_block_avx); \
 202 } while (0)
 203
     /*
      * AVX_SELECT(FASTEST) - evaluates to &xor_block_avx when cpu_has_avx is
      * true, and to FASTEST otherwise.
      */
 204 #define AVX_SELECT(FASTEST) \
 205         (cpu_has_avx ? &xor_block_avx : FASTEST)
 206
 207 #else
 208
 209 #define AVX_XOR_SPEED {}
 210
     /*
      * AVX_SELECT(FASTEST) - variant used when CONFIG_AS_AVX is not defined:
      * no AVX template exists in this configuration, so the result is simply
      * FASTEST.
      */
 211 #define AVX_SELECT(FASTEST) (FASTEST)
 212
 213 #endif
 214 #endif