Crypto++
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
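/*
   Illustrative note (not part of the original source): counter-mode caching
   works because consecutive CTR blocks differ only in the low-order counter
   byte(s). Roughly:

       // once per batch: run the first round (and most of the second) on the
       // counter with the incrementing byte masked out, and save the result
       // per block: redo only the few lookups fed by the incrementing byte,
       // then continue with the remaining rounds as usual

   In the assembly below, the saved words live in L_SAVED_X and the per-block
   work is the "counter-mode per-block setup" section.
*/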
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
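/*
   A minimal sketch of the preload pattern described above (illustrative only,
   not part of the original source; the real loops appear in
   Enc/Dec::ProcessAndXorBlock below). Touching one word per cache line pulls
   the whole table into L1 before any key- or data-dependent index is used,
   and the volatile source keeps the compiler from discarding the loop:

       volatile word32 lu = 0;
       word32 u = lu;
       // preloadBytes is 1024 or 2048 in the real code, depending on table layout
       for (unsigned int i = 0; i < preloadBytes; i += cacheLineSize)
           u &= *(const word32 *)((const byte *)Te + i);
       s0 |= u; s1 |= u; s2 |= u; s3 |= u;   // u is always 0, so the state is unchanged
*/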
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 
69 #ifndef CRYPTOPP_IMPORTS
70 #ifndef CRYPTOPP_GENERATE_X64_MASM
71 
72 #include "rijndael.h"
73 #include "misc.h"
74 #include "cpu.h"
75 
76 NAMESPACE_BEGIN(CryptoPP)
77 
78 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
79 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
80 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
81 using namespace rdtable;
82 #else
83 static word64 Te[256];
84 #endif
85 static word64 Td[256];
86 #else
87 static word32 Te[256*4], Td[256*4];
88 #endif
89 static volatile bool s_TeFilled = false, s_TdFilled = false;
90 
91 // ************************* Portable Code ************************************
92 
93 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
94  a ^= L(T, 3, byte(t)); t >>= 8;\
95  b ^= L(T, 2, byte(t)); t >>= 8;\
96  c ^= L(T, 1, byte(t)); t >>= 8;\
97  d ^= L(T, 0, t);
98 
99 #define QUARTER_ROUND_LE(t, a, b, c, d) \
100  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
101  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
102  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
103  tempBlock[d] = ((byte *)(Te+t))[1];
104 
105 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
106  #define QUARTER_ROUND_LD(t, a, b, c, d) \
107  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
108  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
109  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
110  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
111 #else
112  #define QUARTER_ROUND_LD(t, a, b, c, d) \
113  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
114  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
115  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
116  tempBlock[d] = Sd[t];
117 #endif
118 
119 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
120 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
121 
122 #ifdef IS_LITTLE_ENDIAN
123  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
124  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
125  #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
126  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
127  #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
128  #else
129  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
130  #define TL_M(T, i, x) T[i*256 + x]
131  #endif
132 #else
133  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
134  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
135  #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
136  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
137  #define TL_M TL_F
138  #else
139  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
140  #define TL_M(T, i, x) T[i*256 + x]
141  #endif
142 #endif
143 
144 
145 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
146 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
147 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
148 
149 #define f3(x) (f2(x) ^ x)
150 #define f9(x) (f8(x) ^ x)
151 #define fb(x) (f8(x) ^ f2(x) ^ x)
152 #define fd(x) (f8(x) ^ f4(x) ^ x)
153 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
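
// Worked example (illustrative, not in the original source): f2() is the
// GF(2^8) "xtime" doubling used by AES, with reduction polynomial 0x11B.
// For x = 0x80: x<<1 = 0x100, and since bit 7 of x was set the product is
// reduced: 0x100 ^ 0x11B = 0x1B. Hence f3(0x80) = f2(0x80) ^ 0x80 = 0x9B.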
154 
155 void Rijndael::Base::FillEncTable()
156 {
157  for (int i=0; i<256; i++)
158  {
159  byte x = Se[i];
160 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
161  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
162  Te[i] = word64(y | f3(x))<<32 | y;
163 #else
164  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
165  for (int j=0; j<4; j++)
166  {
167  Te[i+j*256] = y;
168  y = rotrFixed(y, 8);
169  }
170 #endif
171  }
172 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
173  Te[256] = Te[257] = 0;
174 #endif
175  s_TeFilled = true;
176 }
177 
178 void Rijndael::Base::FillDecTable()
179 {
180  for (int i=0; i<256; i++)
181  {
182  byte x = Sd[i];
183 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
184  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
185  Td[i] = word64(y | fb(x))<<32 | y | x;
186 #else
187  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
188  for (int j=0; j<4; j++)
189  {
190  Td[i+j*256] = y;
191  y = rotrFixed(y, 8);
192  }
193 #endif
194  }
195  s_TdFilled = true;
196 }
197 
198 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
199 {
200  AssertValidKeyLength(keylen);
201 
202  m_rounds = keylen/4 + 6;
203  m_key.New(4*(m_rounds+1));
204 
205  word32 *rk = m_key;
206 
207 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
208  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
209  if (HasAESNI())
210  {
211  static const word32 rcLE[] = {
212  0x01, 0x02, 0x04, 0x08,
213  0x10, 0x20, 0x40, 0x80,
214  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
215  };
216  const word32 *rc = rcLE;
217 
218  __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
219  memcpy(rk, userKey, keylen);
220 
221  while (true)
222  {
223  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
224  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
225  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
226  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
227 
228  if (rk + keylen/4 + 4 == m_key.end())
229  break;
230 
231  if (keylen == 24)
232  {
233  rk[10] = rk[ 4] ^ rk[ 9];
234  rk[11] = rk[ 5] ^ rk[10];
235  temp = _mm_insert_epi32(temp, rk[11], 3);
236  }
237  else if (keylen == 32)
238  {
239  temp = _mm_insert_epi32(temp, rk[11], 3);
240  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
241  rk[13] = rk[ 5] ^ rk[12];
242  rk[14] = rk[ 6] ^ rk[13];
243  rk[15] = rk[ 7] ^ rk[14];
244  temp = _mm_insert_epi32(temp, rk[15], 3);
245  }
246  else
247  temp = _mm_insert_epi32(temp, rk[7], 3);
248 
249  rk += keylen/4;
250  }
251 
252  if (!IsForwardTransformation())
253  {
254  rk = m_key;
255  unsigned int i, j;
256 
257  std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
258 
259  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
260  {
261  temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
262  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
263  *(__m128i *)(rk+j) = temp;
264  }
265 
266  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
267  }
268 
269  return;
270  }
271 #endif
272 
273  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
274  const word32 *rc = rcon;
275  word32 temp;
276 
277  while (true)
278  {
279  temp = rk[keylen/4-1];
280  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
281  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
282  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
283  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
284  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
285 
286  if (rk + keylen/4 + 4 == m_key.end())
287  break;
288 
289  if (keylen == 24)
290  {
291  rk[10] = rk[ 4] ^ rk[ 9];
292  rk[11] = rk[ 5] ^ rk[10];
293  }
294  else if (keylen == 32)
295  {
296  temp = rk[11];
297  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
298  rk[13] = rk[ 5] ^ rk[12];
299  rk[14] = rk[ 6] ^ rk[13];
300  rk[15] = rk[ 7] ^ rk[14];
301  }
302  rk += keylen/4;
303  }
304 
305  rk = m_key;
306 
307  if (IsForwardTransformation())
308  {
309  if (!s_TeFilled)
310  FillEncTable();
311 
312  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
313  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
314  }
315  else
316  {
317  if (!s_TdFilled)
318  FillDecTable();
319 
320  unsigned int i, j;
321 
322 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
323 
324  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
325  {
326  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
327  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
328  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
329  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
330  }
331 
332  rk[i+0] = InverseMixColumn(rk[i+0]);
333  rk[i+1] = InverseMixColumn(rk[i+1]);
334  rk[i+2] = InverseMixColumn(rk[i+2]);
335  rk[i+3] = InverseMixColumn(rk[i+3]);
336 
337  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
338  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
339  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
340  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
341  }
342 
343 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
344  if (HasAESNI())
345  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
346 #endif
347 }
348 
349 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
350 {
351 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
352 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
353  if (HasSSE2())
354 #else
355  if (HasAESNI())
356 #endif
357  {
358  Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
359  return;
360  }
361 #endif
362 
364 
365  word32 s0, s1, s2, s3, t0, t1, t2, t3;
366  Block::Get(inBlock)(s0)(s1)(s2)(s3);
367 
368  const word32 *rk = m_key;
369  s0 ^= rk[0];
370  s1 ^= rk[1];
371  s2 ^= rk[2];
372  s3 ^= rk[3];
373  t0 = rk[4];
374  t1 = rk[5];
375  t2 = rk[6];
376  t3 = rk[7];
377  rk += 8;
378 
379  // timing attack countermeasure. see comments at top for more details.
380  // also see http://github.com/weidai11/cryptopp/issues/146
381  const int cacheLineSize = GetCacheLineSize();
382  unsigned int i;
383  volatile word32 _u = 0;
384  word32 u = _u;
385 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
386  for (i=0; i<2048; i+=cacheLineSize)
387 #else
388  for (i=0; i<1024; i+=cacheLineSize)
389 #endif
390  u &= *(const word32 *)(((const byte *)Te)+i);
391  u &= Te[255];
392  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
393 
394  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
395  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
396  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
397  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
398 
399  // Nr - 2 full rounds:
400  unsigned int r = m_rounds/2 - 1;
401  do
402  {
403  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
404 
405  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
406  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
407  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
408  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
409 
410  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
411 
412  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
413  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
414  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
415  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
416 
417  rk += 8;
418  } while (--r);
419 
420  word32 tbw[4];
421  byte *const tempBlock = (byte *)tbw;
422 
423  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
424  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
425  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
426  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
427 
428  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
429 }
430 
431 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
432 {
433 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
434  if (HasAESNI())
435  {
436  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
437  return;
438  }
439 #endif
440 
442 
443  word32 s0, s1, s2, s3, t0, t1, t2, t3;
444  Block::Get(inBlock)(s0)(s1)(s2)(s3);
445 
446  const word32 *rk = m_key;
447  s0 ^= rk[0];
448  s1 ^= rk[1];
449  s2 ^= rk[2];
450  s3 ^= rk[3];
451  t0 = rk[4];
452  t1 = rk[5];
453  t2 = rk[6];
454  t3 = rk[7];
455  rk += 8;
456 
457  // timing attack countermeasure. see comments at top for more details.
458  // also see http://github.com/weidai11/cryptopp/issues/146
459  const int cacheLineSize = GetCacheLineSize();
460  unsigned int i;
461  volatile word32 _u = 0;
462  word32 u = _u;
463 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
464  for (i=0; i<2048; i+=cacheLineSize)
465 #else
466  for (i=0; i<1024; i+=cacheLineSize)
467 #endif
468  u &= *(const word32 *)(((const byte *)Td)+i);
469  u &= Td[255];
470  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
471 
472  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
473  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
474  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
475  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
476 
477  // Nr - 2 full rounds:
478  unsigned int r = m_rounds/2 - 1;
479  do
480  {
481  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
482 
483  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
484  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
485  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
486  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
487 
488  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
489 
490  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
491  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
492  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
493  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
494 
495  rk += 8;
496  } while (--r);
497 
498 #ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
499  // timing attack countermeasure. see comments at top for more details
500  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
501  // QUARTER_ROUND_LD will use Td, which is already preloaded.
502  u = _u;
503  for (i=0; i<256; i+=cacheLineSize)
504  u &= *(const word32 *)(Sd+i);
505  u &= *(const word32 *)(Sd+252);
506  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
507 #endif
508 
509  word32 tbw[4];
510  byte *const tempBlock = (byte *)tbw;
511 
512  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
513  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
514  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
515  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
516 
517  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
518 }
519 
520 // ************************* Assembly Code ************************************
521 
522 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
523 
524 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
525 
526 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
527 
528 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
529 {
530 #if CRYPTOPP_BOOL_X86
531 
532 #define L_REG esp
533 #define L_INDEX(i) (L_REG+768+i)
534 #define L_INXORBLOCKS L_INBLOCKS+4
535 #define L_OUTXORBLOCKS L_INBLOCKS+8
536 #define L_OUTBLOCKS L_INBLOCKS+12
537 #define L_INCREMENTS L_INDEX(16*15)
538 #define L_SP L_INDEX(16*16)
539 #define L_LENGTH L_INDEX(16*16+4)
540 #define L_KEYS_BEGIN L_INDEX(16*16+8)
541 
542 #define MOVD movd
543 #define MM(i) mm##i
544 
545 #define MXOR(a,b,c) \
546  AS2( movzx esi, b)\
547  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
548  AS2( pxor MM(a), mm7)\
549 
550 #define MMOV(a,b,c) \
551  AS2( movzx esi, b)\
552  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
553 
554 #else
555 
556 #define L_REG r8
557 #define L_INDEX(i) (L_REG+i)
558 #define L_INXORBLOCKS L_INBLOCKS+8
559 #define L_OUTXORBLOCKS L_INBLOCKS+16
560 #define L_OUTBLOCKS L_INBLOCKS+24
561 #define L_INCREMENTS L_INDEX(16*16)
562 #define L_LENGTH L_INDEX(16*18+8)
563 #define L_KEYS_BEGIN L_INDEX(16*19)
564 
565 #define MOVD mov
566 #define MM_0 r9d
567 #define MM_1 r12d
568 #ifdef __GNUC__
569 #define MM_2 r11d
570 #else
571 #define MM_2 r10d
572 #endif
573 #define MM(i) MM_##i
574 
575 #define MXOR(a,b,c) \
576  AS2( movzx esi, b)\
577  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
578 
579 #define MMOV(a,b,c) \
580  AS2( movzx esi, b)\
581  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
582 
583 #endif
584 
585 #define L_SUBKEYS L_INDEX(0)
586 #define L_SAVED_X L_SUBKEYS
587 #define L_KEY12 L_INDEX(16*12)
588 #define L_LASTROUND L_INDEX(16*13)
589 #define L_INBLOCKS L_INDEX(16*14)
590 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
591 
592 #define XOR(a,b,c) \
593  AS2( movzx esi, b)\
594  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
595 
596 #define MOV(a,b,c) \
597  AS2( movzx esi, b)\
598  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
599 
600 #ifdef CRYPTOPP_GENERATE_X64_MASM
601  ALIGN 8
602  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
603  rex_push_reg rsi
604  push_reg rdi
605  push_reg rbx
606  push_reg r12
607  .endprolog
608  mov L_REG, rcx
609  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
610  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
611 #elif defined(__GNUC__)
612  __asm__ __volatile__
613  (
614  ".intel_syntax noprefix;"
615  #if CRYPTOPP_BOOL_X64
616  AS2( mov L_REG, rcx)
617  #endif
618  AS_PUSH_IF86(bx)
619  AS_PUSH_IF86(bp)
620  AS2( mov AS_REG_7, WORD_REG(si))
621 #else
622  AS_PUSH_IF86(si)
623  AS_PUSH_IF86(di)
624  AS_PUSH_IF86(bx)
625  AS_PUSH_IF86(bp)
626  AS2( lea AS_REG_7, [Te])
627  AS2( mov edi, [g_cacheLineSize])
628 #endif
629 
630 #if CRYPTOPP_BOOL_X86
631  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
632  AS2( lea esp, [ecx-768])
633 #endif
634 
635  // copy subkeys to stack
636  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
637  AS2( mov WORD_REG(ax), 16)
638  AS2( and WORD_REG(ax), WORD_REG(si))
639  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
640  AS2( movdqa [L_KEY12], xmm3)
641  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
642  AS2( sub WORD_REG(ax), WORD_REG(si))
643  ASL(0)
644  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
645  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
646  AS2( add WORD_REG(si), 16)
647  AS2( cmp WORD_REG(si), 16*12)
648  ASJ( jl, 0, b)
649 
650  // read subkeys 0, 1 and last
651  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
652  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
653  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
654  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
655  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
656  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
657 
658  // load table into cache
659  AS2( xor WORD_REG(ax), WORD_REG(ax))
660  ASL(9)
661  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
662  AS2( add WORD_REG(ax), WORD_REG(di))
663  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
664  AS2( add WORD_REG(ax), WORD_REG(di))
665  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
666  AS2( add WORD_REG(ax), WORD_REG(di))
667  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
668  AS2( add WORD_REG(ax), WORD_REG(di))
669  AS2( cmp WORD_REG(ax), 2048)
670  ASJ( jl, 9, b)
671  AS1( lfence)
672 
673  AS2( test DWORD PTR [L_LENGTH], 1)
674  ASJ( jz, 8, f)
675 
676  // counter mode one-time setup
677  AS2( mov WORD_REG(si), [L_INBLOCKS])
678  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
679  AS2( pxor xmm2, xmm1)
680  AS2( psrldq xmm1, 14)
681  AS2( movd eax, xmm1)
682  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
683  AS2( MOVD MM(2), eax)
684 #if CRYPTOPP_BOOL_X86
685  AS2( mov eax, 1)
686  AS2( movd mm3, eax)
687 #endif
688 
689  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
690  AS2( movd eax, xmm2)
691  AS2( psrldq xmm2, 4)
692  AS2( movd edi, xmm2)
693  AS2( psrldq xmm2, 4)
694  MXOR( 1, al, 0) // 0
695  XOR( edx, ah, 1) // 1
696  AS2( shr eax, 16)
697  XOR( ecx, al, 2) // 2
698  XOR( ebx, ah, 3) // 3
699  AS2( mov eax, edi)
700  AS2( movd edi, xmm2)
701  AS2( psrldq xmm2, 4)
702  XOR( ebx, al, 0) // 4
703  MXOR( 1, ah, 1) // 5
704  AS2( shr eax, 16)
705  XOR( edx, al, 2) // 6
706  XOR( ecx, ah, 3) // 7
707  AS2( mov eax, edi)
708  AS2( movd edi, xmm2)
709  XOR( ecx, al, 0) // 8
710  XOR( ebx, ah, 1) // 9
711  AS2( shr eax, 16)
712  MXOR( 1, al, 2) // 10
713  XOR( edx, ah, 3) // 11
714  AS2( mov eax, edi)
715  XOR( edx, al, 0) // 12
716  XOR( ecx, ah, 1) // 13
717  AS2( shr eax, 16)
718  XOR( ebx, al, 2) // 14
719  AS2( psrldq xmm2, 3)
720 
721  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
722  AS2( mov eax, [L_KEY12+0*4])
723  AS2( mov edi, [L_KEY12+2*4])
724  AS2( MOVD MM(0), [L_KEY12+3*4])
725  MXOR( 0, cl, 3) /* 11 */
726  XOR( edi, bl, 3) /* 7 */
727  MXOR( 0, bh, 2) /* 6 */
728  AS2( shr ebx, 16) /* 4,5 */
729  XOR( eax, bl, 1) /* 5 */
730  MOV( ebx, bh, 0) /* 4 */
731  AS2( xor ebx, [L_KEY12+1*4])
732  XOR( eax, ch, 2) /* 10 */
733  AS2( shr ecx, 16) /* 8,9 */
734  XOR( eax, dl, 3) /* 15 */
735  XOR( ebx, dh, 2) /* 14 */
736  AS2( shr edx, 16) /* 12,13 */
737  XOR( edi, ch, 0) /* 8 */
738  XOR( ebx, cl, 1) /* 9 */
739  XOR( edi, dl, 1) /* 13 */
740  MXOR( 0, dh, 0) /* 12 */
741 
742  AS2( movd ecx, xmm2)
743  AS2( MOVD edx, MM(1))
744  AS2( MOVD [L_SAVED_X+3*4], MM(0))
745  AS2( mov [L_SAVED_X+0*4], eax)
746  AS2( mov [L_SAVED_X+1*4], ebx)
747  AS2( mov [L_SAVED_X+2*4], edi)
748  ASJ( jmp, 5, f)
749 
750  ASL(3)
751  // non-counter mode per-block setup
752  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
753  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
754  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
755  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
756  ASL(8)
757  AS2( mov WORD_REG(ax), [L_INBLOCKS])
758  AS2( movdqu xmm2, [WORD_REG(ax)])
759  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
760  AS2( movdqu xmm5, [WORD_REG(si)])
761  AS2( pxor xmm2, xmm1)
762  AS2( pxor xmm2, xmm5)
763 
764  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
765  AS2( movd eax, xmm2)
766  AS2( psrldq xmm2, 4)
767  AS2( movd edi, xmm2)
768  AS2( psrldq xmm2, 4)
769  MXOR( 1, al, 0) // 0
770  XOR( edx, ah, 1) // 1
771  AS2( shr eax, 16)
772  XOR( ecx, al, 2) // 2
773  XOR( ebx, ah, 3) // 3
774  AS2( mov eax, edi)
775  AS2( movd edi, xmm2)
776  AS2( psrldq xmm2, 4)
777  XOR( ebx, al, 0) // 4
778  MXOR( 1, ah, 1) // 5
779  AS2( shr eax, 16)
780  XOR( edx, al, 2) // 6
781  XOR( ecx, ah, 3) // 7
782  AS2( mov eax, edi)
783  AS2( movd edi, xmm2)
784  XOR( ecx, al, 0) // 8
785  XOR( ebx, ah, 1) // 9
786  AS2( shr eax, 16)
787  MXOR( 1, al, 2) // 10
788  XOR( edx, ah, 3) // 11
789  AS2( mov eax, edi)
790  XOR( edx, al, 0) // 12
791  XOR( ecx, ah, 1) // 13
792  AS2( shr eax, 16)
793  XOR( ebx, al, 2) // 14
794  MXOR( 1, ah, 3) // 15
795  AS2( MOVD eax, MM(1))
796 
797  AS2( add L_REG, [L_KEYS_BEGIN])
798  AS2( add L_REG, 4*16)
799  ASJ( jmp, 2, f)
800 
801  ASL(1)
802  // counter-mode per-block setup
803  AS2( MOVD ecx, MM(2))
804  AS2( MOVD edx, MM(1))
805  AS2( mov eax, [L_SAVED_X+0*4])
806  AS2( mov ebx, [L_SAVED_X+1*4])
807  AS2( xor cl, ch)
808  AS2( and WORD_REG(cx), 255)
809  ASL(5)
810 #if CRYPTOPP_BOOL_X86
811  AS2( paddb MM(2), mm3)
812 #else
813  AS2( add MM(2), 1)
814 #endif
815  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
816  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
817  XOR( ebx, dl, 3)
818  MOV( ecx, dh, 2)
819  AS2( shr edx, 16)
820  AS2( xor ecx, [L_SAVED_X+2*4])
821  XOR( eax, dh, 0)
822  MOV( edx, dl, 1)
823  AS2( xor edx, [L_SAVED_X+3*4])
824 
825  AS2( add L_REG, [L_KEYS_BEGIN])
826  AS2( add L_REG, 3*16)
827  ASJ( jmp, 4, f)
828 
829 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
830 // out: eax, ebx, edi, mm0
831 #define ROUND() \
832  MXOR( 0, cl, 3) /* 11 */\
833  AS2( mov cl, al) /* 8,9,10,3 */\
834  XOR( edi, ah, 2) /* 2 */\
835  AS2( shr eax, 16) /* 0,1 */\
836  XOR( edi, bl, 3) /* 7 */\
837  MXOR( 0, bh, 2) /* 6 */\
838  AS2( shr ebx, 16) /* 4,5 */\
839  MXOR( 0, al, 1) /* 1 */\
840  MOV( eax, ah, 0) /* 0 */\
841  XOR( eax, bl, 1) /* 5 */\
842  MOV( ebx, bh, 0) /* 4 */\
843  XOR( eax, ch, 2) /* 10 */\
844  XOR( ebx, cl, 3) /* 3 */\
845  AS2( shr ecx, 16) /* 8,9 */\
846  XOR( eax, dl, 3) /* 15 */\
847  XOR( ebx, dh, 2) /* 14 */\
848  AS2( shr edx, 16) /* 12,13 */\
849  XOR( edi, ch, 0) /* 8 */\
850  XOR( ebx, cl, 1) /* 9 */\
851  XOR( edi, dl, 1) /* 13 */\
852  MXOR( 0, dh, 0) /* 12 */\
853 
854  ASL(2) // 2-round loop
855  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
856  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
857  ROUND()
858  AS2( mov ecx, edi)
859  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
860  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
861  AS2( MOVD edx, MM(0))
862 
863  ASL(4)
864  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
865  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
866  ROUND()
867  AS2( mov ecx, edi)
868  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
869  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
870  AS2( MOVD edx, MM(0))
871 
872  AS2( add L_REG, 32)
873  AS2( test L_REG, 255)
874  ASJ( jnz, 2, b)
875  AS2( sub L_REG, 16*16)
876 
877 #define LAST(a, b, c) \
878  AS2( movzx esi, a )\
879  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
880  AS2( movzx esi, b )\
881  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
882  AS2( mov WORD PTR [L_LASTROUND+c], di )\
883 
884  // last round
885  LAST(ch, dl, 2)
886  LAST(dh, al, 6)
887  AS2( shr edx, 16)
888  LAST(ah, bl, 10)
889  AS2( shr eax, 16)
890  LAST(bh, cl, 14)
891  AS2( shr ebx, 16)
892  LAST(dh, al, 12)
893  AS2( shr ecx, 16)
894  LAST(ah, bl, 0)
895  LAST(bh, cl, 4)
896  LAST(ch, dl, 8)
897 
898  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
899  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
900 
901  AS2( mov WORD_REG(cx), [L_LENGTH])
902  AS2( sub WORD_REG(cx), 16)
903 
904  AS2( movdqu xmm2, [WORD_REG(ax)])
905  AS2( pxor xmm2, xmm4)
906 
907 #if CRYPTOPP_BOOL_X86
908  AS2( movdqa xmm0, [L_INCREMENTS])
909  AS2( paddd xmm0, [L_INBLOCKS])
910  AS2( movdqa [L_INBLOCKS], xmm0)
911 #else
912  AS2( movdqa xmm0, [L_INCREMENTS+16])
913  AS2( paddq xmm0, [L_INBLOCKS+16])
914  AS2( movdqa [L_INBLOCKS+16], xmm0)
915 #endif
916 
917  AS2( pxor xmm2, [L_LASTROUND])
918  AS2( movdqu [WORD_REG(bx)], xmm2)
919 
920  ASJ( jle, 7, f)
921  AS2( mov [L_LENGTH], WORD_REG(cx))
922  AS2( test WORD_REG(cx), 1)
923  ASJ( jnz, 1, b)
924 #if CRYPTOPP_BOOL_X64
925  AS2( movdqa xmm0, [L_INCREMENTS])
926  AS2( paddq xmm0, [L_INBLOCKS])
927  AS2( movdqa [L_INBLOCKS], xmm0)
928 #endif
929  ASJ( jmp, 3, b)
930 
931  ASL(7)
932  // erase keys on stack
933  AS2( xorps xmm0, xmm0)
934  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
935  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
936  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
937  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
938  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
939  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
940  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
941  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
942  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
943  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
944  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
945  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
946  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
947  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
948  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
949 #if CRYPTOPP_BOOL_X86
950  AS2( mov esp, [L_SP])
951  AS1( emms)
952 #endif
953  AS_POP_IF86(bp)
954  AS_POP_IF86(bx)
955 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
956  AS_POP_IF86(di)
957  AS_POP_IF86(si)
958  AS1(ret)
959 #endif
960 #ifdef CRYPTOPP_GENERATE_X64_MASM
961  pop r12
962  pop rbx
963  pop rdi
964  pop rsi
965  ret
966  Rijndael_Enc_AdvancedProcessBlocks ENDP
967 #endif
968 #ifdef __GNUC__
969  ".att_syntax prefix;"
970  :
971  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
972  : "memory", "cc", "%eax"
973  #if CRYPTOPP_BOOL_X64
974  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
975  #endif
976  );
977 #endif
978 }
979 
980 #endif
981 
982 #ifndef CRYPTOPP_GENERATE_X64_MASM
983 
984 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
985 extern "C" {
986 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
987 }
988 #endif
989 
990 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86
991 
992 static inline bool AliasedWithTable(const byte *begin, const byte *end)
993 {
994  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
995  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
996  if (t1 > t0)
997  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
998  else
999  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
1000 }
1001 
1002 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1003 
1004 inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1005 {
1006  block = _mm_xor_si128(block, subkeys[0]);
1007  for (unsigned int i=1; i<rounds-1; i+=2)
1008  {
1009  block = _mm_aesenc_si128(block, subkeys[i]);
1010  block = _mm_aesenc_si128(block, subkeys[i+1]);
1011  }
1012  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1013  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1014 }
1015 
1016 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1017 {
1018  __m128i rk = subkeys[0];
1019  block0 = _mm_xor_si128(block0, rk);
1020  block1 = _mm_xor_si128(block1, rk);
1021  block2 = _mm_xor_si128(block2, rk);
1022  block3 = _mm_xor_si128(block3, rk);
1023  for (unsigned int i=1; i<rounds; i++)
1024  {
1025  rk = subkeys[i];
1026  block0 = _mm_aesenc_si128(block0, rk);
1027  block1 = _mm_aesenc_si128(block1, rk);
1028  block2 = _mm_aesenc_si128(block2, rk);
1029  block3 = _mm_aesenc_si128(block3, rk);
1030  }
1031  rk = subkeys[rounds];
1032  block0 = _mm_aesenclast_si128(block0, rk);
1033  block1 = _mm_aesenclast_si128(block1, rk);
1034  block2 = _mm_aesenclast_si128(block2, rk);
1035  block3 = _mm_aesenclast_si128(block3, rk);
1036 }
1037 
1038 inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1039 {
1040  block = _mm_xor_si128(block, subkeys[0]);
1041  for (unsigned int i=1; i<rounds-1; i+=2)
1042  {
1043  block = _mm_aesdec_si128(block, subkeys[i]);
1044  block = _mm_aesdec_si128(block, subkeys[i+1]);
1045  }
1046  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1047  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1048 }
1049 
1050 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1051 {
1052  __m128i rk = subkeys[0];
1053  block0 = _mm_xor_si128(block0, rk);
1054  block1 = _mm_xor_si128(block1, rk);
1055  block2 = _mm_xor_si128(block2, rk);
1056  block3 = _mm_xor_si128(block3, rk);
1057  for (unsigned int i=1; i<rounds; i++)
1058  {
1059  rk = subkeys[i];
1060  block0 = _mm_aesdec_si128(block0, rk);
1061  block1 = _mm_aesdec_si128(block1, rk);
1062  block2 = _mm_aesdec_si128(block2, rk);
1063  block3 = _mm_aesdec_si128(block3, rk);
1064  }
1065  rk = subkeys[rounds];
1066  block0 = _mm_aesdeclast_si128(block0, rk);
1067  block1 = _mm_aesdeclast_si128(block1, rk);
1068  block2 = _mm_aesdeclast_si128(block2, rk);
1069  block3 = _mm_aesdeclast_si128(block3, rk);
1070 }
1071 
1072 static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
1073 
1074 template <typename F1, typename F4>
1075 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1076 {
1077  size_t blockSize = 16;
1078  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1079  size_t xorIncrement = xorBlocks ? blockSize : 0;
1080  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1081 
1082  if (flags & BlockTransformation::BT_ReverseDirection)
1083  {
1084  assert(length % blockSize == 0);
1085  inBlocks += length - blockSize;
1086  xorBlocks += length - blockSize;
1087  outBlocks += length - blockSize;
1088  inIncrement = 0-inIncrement;
1089  xorIncrement = 0-xorIncrement;
1090  outIncrement = 0-outIncrement;
1091  }
1092 
1093  if (flags & BlockTransformation::BT_AllowParallel)
1094  {
1095  while (length >= 4*blockSize)
1096  {
1097  __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
1098  if (flags & BlockTransformation::BT_InBlockIsCounter)
1099  {
1100  const __m128i be1 = *(const __m128i *)s_one;
1101  block1 = _mm_add_epi32(block0, be1);
1102  block2 = _mm_add_epi32(block1, be1);
1103  block3 = _mm_add_epi32(block2, be1);
1104  _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
1105  }
1106  else
1107  {
1108  inBlocks += inIncrement;
1109  block1 = _mm_loadu_si128((const __m128i *)inBlocks);
1110  inBlocks += inIncrement;
1111  block2 = _mm_loadu_si128((const __m128i *)inBlocks);
1112  inBlocks += inIncrement;
1113  block3 = _mm_loadu_si128((const __m128i *)inBlocks);
1114  inBlocks += inIncrement;
1115  }
1116 
1117  if (flags & BlockTransformation::BT_XorInput)
1118  {
1119  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1120  xorBlocks += xorIncrement;
1121  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1122  xorBlocks += xorIncrement;
1123  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1124  xorBlocks += xorIncrement;
1125  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1126  xorBlocks += xorIncrement;
1127  }
1128 
1129  func4(block0, block1, block2, block3, subkeys, rounds);
1130 
1131  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1132  {
1133  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1134  xorBlocks += xorIncrement;
1135  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1136  xorBlocks += xorIncrement;
1137  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1138  xorBlocks += xorIncrement;
1139  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1140  xorBlocks += xorIncrement;
1141  }
1142 
1143  _mm_storeu_si128((__m128i *)outBlocks, block0);
1144  outBlocks += outIncrement;
1145  _mm_storeu_si128((__m128i *)outBlocks, block1);
1146  outBlocks += outIncrement;
1147  _mm_storeu_si128((__m128i *)outBlocks, block2);
1148  outBlocks += outIncrement;
1149  _mm_storeu_si128((__m128i *)outBlocks, block3);
1150  outBlocks += outIncrement;
1151 
1152  length -= 4*blockSize;
1153  }
1154  }
1155 
1156  while (length >= blockSize)
1157  {
1158  __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);
1159 
1160  if (flags & BlockTransformation::BT_XorInput)
1161  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1162 
1163  if (flags & BlockTransformation::BT_InBlockIsCounter)
1164  const_cast<byte *>(inBlocks)[15]++;
1165 
1166  func1(block, subkeys, rounds);
1167 
1168  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1169  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1170 
1171  _mm_storeu_si128((__m128i *)outBlocks, block);
1172 
1173  inBlocks += inIncrement;
1174  outBlocks += outIncrement;
1175  xorBlocks += xorIncrement;
1176  length -= blockSize;
1177  }
1178 
1179  return length;
1180 }
1181 #endif
1182 
1183 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1184 {
1185 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1186  if (HasAESNI())
1187  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1188 #endif
1189 
1190 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
1191  if (HasSSE2())
1192  {
1193  if (length < BLOCKSIZE)
1194  return length;
1195 
1196  struct Locals
1197  {
1198  word32 subkeys[4*12], workspace[8];
1199  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1200  byte *outBlocks;
1201  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1202  size_t regSpill, lengthAndCounterFlag, keysBegin;
1203  };
1204 
1205  size_t increment = BLOCKSIZE;
1206  const byte* zeros = (byte *)(Te+256);
1207  byte *space;
1208 
1209  do {
1210  space = (byte *)alloca(255+sizeof(Locals));
1211  space += (256-(size_t)space%256)%256;
1212  }
1213  while (AliasedWithTable(space, space+sizeof(Locals)));
1214 
1215  if (flags & BT_ReverseDirection)
1216  {
1217  assert(length % BLOCKSIZE == 0);
1218  inBlocks += length - BLOCKSIZE;
1219  xorBlocks += length - BLOCKSIZE;
1220  outBlocks += length - BLOCKSIZE;
1221  increment = 0-increment;
1222  }
1223 
1224  Locals &locals = *(Locals *)space;
1225 
1226  locals.inBlocks = inBlocks;
1227  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1228  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1229  locals.outBlocks = outBlocks;
1230 
1231  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1232  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1233  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1234  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1235 
1236  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1237  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1238  locals.keysBegin = (12-keysToCopy)*16;
1239 
1240  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1241  return length % BLOCKSIZE;
1242  }
1243 #endif
1244 
1245  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1246 }
1247 
1248 #endif
1249 
1250 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1251 
1252 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1253 {
1254  if (HasAESNI())
1255  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1256 
1257  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1258 }
1259 
1260 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1261 
1262 NAMESPACE_END
1263 
1264 #endif
1265 #endif