/* AesOpt.c -- Intel's AES
2009-11-23 : Igor Pavlov : Public domain */
\r
6 #ifdef MY_CPU_X86_OR_AMD64
\r
8 #define USE_INTEL_AES
\r
12 #ifdef USE_INTEL_AES
\r
14 #include <wmmintrin.h>
\r
16 void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
\r
19 for (; numBlocks != 0; numBlocks--, data++)
\r
21 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
\r
22 const __m128i *w = p + 3;
\r
23 m = _mm_xor_si128(m, *data);
\r
24 m = _mm_xor_si128(m, p[2]);
\r
27 m = _mm_aesenc_si128(m, w[0]);
\r
28 m = _mm_aesenc_si128(m, w[1]);
\r
31 while (--numRounds2 != 0);
\r
32 m = _mm_aesenc_si128(m, w[0]);
\r
33 m = _mm_aesenclast_si128(m, w[1]);
\r
/* Number of blocks processed in parallel to hide AES-NI instruction latency. */
#define NUM_WAYS 3

/* Apply AES round operation `op` with round key w[n] to all three lanes.
   Requires __m128i locals m0, m1, m2 and round-key pointer w in scope. */
#define AES_OP_W(op, n) { \
    const __m128i t = w[n]; \
    m0 = op(m0, t); \
    m1 = op(m1, t); \
    m2 = op(m2, t); \
    }

#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n)
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n)
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
53 void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
\r
56 for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
\r
58 UInt32 numRounds2 = *(const UInt32 *)(p + 1);
\r
59 const __m128i *w = p + numRounds2 * 2;
\r
62 const __m128i t = w[2];
\r
63 m0 = _mm_xor_si128(t, data[0]);
\r
64 m1 = _mm_xor_si128(t, data[1]);
\r
65 m2 = _mm_xor_si128(t, data[2]);
\r
74 while (--numRounds2 != 0);
\r
80 t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
\r
81 t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
\r
82 t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
\r
85 for (; numBlocks != 0; numBlocks--, data++)
\r
87 UInt32 numRounds2 = *(const UInt32 *)(p + 1);
\r
88 const __m128i *w = p + numRounds2 * 2;
\r
89 __m128i m = _mm_xor_si128(w[2], *data);
\r
93 m = _mm_aesdec_si128(m, w[1]);
\r
94 m = _mm_aesdec_si128(m, w[0]);
\r
97 while (--numRounds2 != 0);
\r
98 m = _mm_aesdec_si128(m, w[1]);
\r
99 m = _mm_aesdeclast_si128(m, w[0]);
\r
101 m = _mm_xor_si128(m, iv);
\r
108 void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
\r
112 one.m128i_u64[0] = 1;
\r
113 one.m128i_u64[1] = 0;
\r
114 for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
\r
116 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
\r
117 const __m128i *w = p;
\r
118 __m128i m0, m1, m2;
\r
120 const __m128i t = w[2];
\r
121 ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
\r
122 ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
\r
123 ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
\r
132 while (--numRounds2 != 0);
\r
135 data[0] = _mm_xor_si128(data[0], m0);
\r
136 data[1] = _mm_xor_si128(data[1], m1);
\r
137 data[2] = _mm_xor_si128(data[2], m2);
\r
139 for (; numBlocks != 0; numBlocks--, data++)
\r
141 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
\r
142 const __m128i *w = p;
\r
144 ctr = _mm_add_epi64(ctr, one);
\r
145 m = _mm_xor_si128(ctr, p[2]);
\r
149 m = _mm_aesenc_si128(m, w[0]);
\r
150 m = _mm_aesenc_si128(m, w[1]);
\r
153 while (--numRounds2 != 0);
\r
154 m = _mm_aesenc_si128(m, w[0]);
\r
155 m = _mm_aesenclast_si128(m, w[1]);
\r
156 *data = _mm_xor_si128(*data, m);
\r
163 void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
\r
164 void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
\r
165 void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);
\r
167 void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
\r
169 AesCbc_Encode(p, data, numBlocks);
\r
172 void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
\r
174 AesCbc_Decode(p, data, numBlocks);
\r
177 void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
\r
179 AesCtr_Code(p, data, numBlocks);
\r