#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H
/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/i387.h>
#define ALIGN32 __aligned(32)

#define YMM_SAVED_REGS 4
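/*
 * The routines below clobber ymm0-ymm3, so their previous contents are
 * spilled into an on-stack, 32-byte aligned save area and restored when
 * done. Preemption is disabled and CR0.TS is cleared while the YMM
 * registers are in use.
 */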
#define YMMS_SAVE \
do { \
	preempt_disable(); \
	cr0 = read_cr0(); \
	clts(); \
	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
} while (0);

#define YMMS_RESTORE \
do { \
	asm volatile("sfence" : : : "memory"); \
	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
	write_cr0(cr0); \
	preempt_enable(); \
} while (0);

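/*
 * Each BLOCK(offset, reg) XORs one 32-byte chunk through a single ymm
 * register; BLOCK16() unrolls sixteen of them, covering the 512 bytes
 * handled per loop iteration in the functions below.
 */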
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

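/*
 * p0 ^= p1, 512 bytes per loop iteration; a tail shorter than 512 bytes
 * is left untouched.
 */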
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	YMMS_RESTORE
}
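/* p0 ^= p1 ^ p2 */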
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	YMMS_RESTORE
}
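/* p0 ^= p1 ^ p2 ^ p3 */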
static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	YMMS_RESTORE
}
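/* p0 ^= p1 ^ p2 ^ p3 ^ p4 */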
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	YMMS_RESTORE
}
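/*
 * Template describing these routines to the generic xor benchmarking and
 * selection code.
 */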
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};
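/*
 * AVX_XOR_SPEED feeds this template into the xor calibration run when AVX
 * is available; AVX_SELECT prefers it over the otherwise fastest template
 * on AVX-capable CPUs.
 */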
#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx) \
		xor_speed(&xor_block_avx); \
} while (0)
#define AVX_SELECT(FASTEST) \
	(cpu_has_avx ? &xor_block_avx : FASTEST)
#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif