Crypto++
// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
// and Wei Dai from Paulo Barreto's Rijndael implementation
// The original code and all modifications are in the public domain.

// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code

/*
July 2010: Added support for AES-NI instructions via compiler intrinsics.
*/

/*
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
and Peter Schwabe in their paper "New AES software speed records". The round
function was also modified to include a trick similar to one in Brian Gladman's
x86 assembly code, doing an 8-bit register move to minimize the number of
register spills. Also switched to compressed tables and copying round keys to
the stack.

The C++ implementation now uses compressed tables if
CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
*/

/*
July 2006: Defense against timing attacks was added by Wei Dai.

The code now uses smaller tables in the first and last rounds,
and preloads them into L1 cache before usage (by loading at least
one element in each cache line).

We try to delay subsequent accesses to each table (used in the first
and last rounds) until all of the table has been preloaded. Hopefully
the compiler isn't smart enough to optimize that code away.

After preloading the table, we also try not to access any memory location
other than the table and the stack, in order to prevent table entries from
being unloaded from L1 cache, until that round is finished.
(Some popular CPUs have 2-way associative caches.)
*/

// This is the original introductory comment:

/**
 * version 3.0 (December 2000)
 *
 * Optimised ANSI C code for the Rijndael cipher (now AES)
 *
 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
 * author Paulo Barreto <paulo.barreto@terra.com.br>
 *
 * This code is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
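
/*
	Editorial note (illustrative, not part of the original source): "counter mode
	caching" exploits the fact that consecutive CTR-mode counter blocks differ only
	in the last byte for runs of up to 256 blocks, so the first-round table lookups
	for the other 15 bytes (and part of the second round) can be computed once and
	reused; only the lookup keyed by the changing counter byte is redone per block.
	The assembly routine below stores that reusable state in L_SAVED_X. A rough
	sketch of the idea, using hypothetical helper names:

		// once per run:   saved = first_round_except_last_byte(counter ^ rk0);
		// per block:      state = saved ^ Te_lookup(byte(counter ^ rk0));
		//                 ...remaining rounds proceed as usual...
*/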

#include "pch.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static volatile bool s_TeFilled = false, s_TdFilled = false;

// ************************* Portable Code ************************************

#define QUARTER_ROUND(L, T, t, a, b, c, d) \
	a ^= L(T, 3, byte(t)); t >>= 8;\
	b ^= L(T, 2, byte(t)); t >>= 8;\
	c ^= L(T, 1, byte(t)); t >>= 8;\
	d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = Sd[byte(t)]; t >>= 8;\
	tempBlock[b] = Sd[byte(t)]; t >>= 8;\
	tempBlock[c] = Sd[byte(t)]; t >>= 8;\
	tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d)	QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d)	QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d)	QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d)	QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x)	(*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x)	(*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x)	rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x)	T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d)	QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d)	QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x)	(*(word32 *)((byte *)T + x*8 + (4-i)%4))
#define TL_M	TL_F
#else
#define TL_F(T, i, x)	rotrFixed(T[x], i*8)
#define TL_M(T, i, x)	T[i*256 + x]
#endif
#endif


#define f2(x)	((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x)	((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x)	((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x)	(f2(x) ^ x)
#define f9(x)	(f8(x) ^ x)
#define fb(x)	(f8(x) ^ f2(x) ^ x)
#define fd(x)	(f8(x) ^ f4(x) ^ x)
#define fe(x)	(f8(x) ^ f4(x) ^ f2(x))
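
/*
	Editorial note (illustrative, not part of the original source): f2..fe multiply
	by small constants in GF(2^8) modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11b).
	Worked example for doubling 0x80:

		f2(0x80) = (0x80 << 1) ^ 0x11b = 0x100 ^ 0x11b = 0x1b

	f2 and f3 build the MixColumns coefficients used in Te; f9, fb, fd and fe build
	the InvMixColumns coefficients used in Td.
*/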

void Rijndael::Base::FillEncTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		Te[i] = word64(y | f3(x))<<32 | y;
#else
		word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		for (int j=0; j<4; j++)
		{
			Te[i+j*256] = y;
			y = rotrFixed(y, 8);
		}
#endif
	}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	Te[256] = Te[257] = 0;
#endif
	s_TeFilled = true;
}

void Rijndael::Base::FillDecTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		Td[i] = word64(y | fb(x))<<32 | y | x;
#else
		word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		for (int j=0; j<4; j++)
		{
			Td[i+j*256] = y;
			y = rotrFixed(y, 8);
		}
#endif
	}
	s_TdFilled = true;
}

void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
	AssertValidKeyLength(keylen);

	m_rounds = keylen/4 + 6;
	m_key.New(4*(m_rounds+1));

	word32 *rk = m_key;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
	// MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
	if (HasAESNI())
	{
		static const word32 rcLE[] = {
			0x01, 0x02, 0x04, 0x08,
			0x10, 0x20, 0x40, 0x80,
			0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
		};
		const word32 *rc = rcLE;

		__m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
		memcpy(rk, userKey, keylen);

		while (true)
		{
			rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
			rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
			rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
			rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

			if (rk + keylen/4 + 4 == m_key.end())
				break;

			if (keylen == 24)
			{
				rk[10] = rk[ 4] ^ rk[ 9];
				rk[11] = rk[ 5] ^ rk[10];
				temp = _mm_insert_epi32(temp, rk[11], 3);
			}
			else if (keylen == 32)
			{
				temp = _mm_insert_epi32(temp, rk[11], 3);
				rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
				rk[13] = rk[ 5] ^ rk[12];
				rk[14] = rk[ 6] ^ rk[13];
				rk[15] = rk[ 7] ^ rk[14];
				temp = _mm_insert_epi32(temp, rk[15], 3);
			}
			else
				temp = _mm_insert_epi32(temp, rk[7], 3);

			rk += keylen/4;
		}

		if (!IsForwardTransformation())
		{
			rk = m_key;
			unsigned int i, j;

			std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));

			for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
			{
				temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
				*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
				*(__m128i *)(rk+j) = temp;
			}

			*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
		}

		return;
	}
#endif
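
	/*
		Editorial note (illustrative, not part of the original source): with keylen
		in bytes, m_rounds = keylen/4 + 6 and the schedule holds 4*(m_rounds+1) words:

			AES-128: 16-byte key -> 10 rounds -> 44 words
			AES-192: 24-byte key -> 12 rounds -> 52 words
			AES-256: 32-byte key -> 14 rounds -> 60 words

		The portable path below expands the key as in FIPS 197, using the S-box Se
		and the round constants rcon.
	*/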

	GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
	const word32 *rc = rcon;
	word32 temp;

	while (true)
	{
		temp = rk[keylen/4-1];
		word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
		rk[keylen/4] = rk[0] ^ x ^ *(rc++);
		rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
		rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
		rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

		if (rk + keylen/4 + 4 == m_key.end())
			break;

		if (keylen == 24)
		{
			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
		}
		else if (keylen == 32)
		{
			temp = rk[11];
			rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];
		}
		rk += keylen/4;
	}

	rk = m_key;

	if (IsForwardTransformation())
	{
		if (!s_TeFilled)
			FillEncTable();

		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
	}
	else
	{
		if (!s_TdFilled)
			FillDecTable();

		unsigned int i, j;

#define InverseMixColumn(x)	TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

		for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
		{
			temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
			temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
			temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
			temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
		}

		rk[i+0] = InverseMixColumn(rk[i+0]);
		rk[i+1] = InverseMixColumn(rk[i+1]);
		rk[i+2] = InverseMixColumn(rk[i+2]);
		rk[i+3] = InverseMixColumn(rk[i+3]);

		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
	}

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasAESNI())
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
}
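
/*
	Editorial note (illustrative, not part of the original source): the decryption
	schedule above builds the "equivalent inverse cipher" of FIPS 197. Because
	InvMixColumns is linear, InvMixColumns(state ^ rk) == InvMixColumns(state) ^
	InvMixColumns(rk), so applying InverseMixColumn (or _mm_aesimc_si128 in the
	AES-NI path) to the middle round keys once at key setup lets decryption reuse
	the same table-driven round structure as encryption.
*/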

void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasSSE2())
	{
		Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// timing attack countermeasure. see comments at top for more details
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(((const byte *)Te)+i);
	u &= Te[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
	QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
	QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
	QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

	// Nr - 2 full rounds:
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_E(t3, s0, s1, s2, s3)
		QUARTER_ROUND_E(t2, s3, s0, s1, s2)
		QUARTER_ROUND_E(t1, s2, s3, s0, s1)
		QUARTER_ROUND_E(t0, s1, s2, s3, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_E(s3, t0, t1, t2, t3)
		QUARTER_ROUND_E(s2, t3, t0, t1, t2)
		QUARTER_ROUND_E(s1, t2, t3, t0, t1)
		QUARTER_ROUND_E(s0, t1, t2, t3, t0)

		rk += 8;
	} while (--r);

	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
	QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
	QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
	QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
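
/*
	Editorial note (illustrative, not part of the original source): the "u" loop is
	the timing-attack countermeasure described at the top of the file. u starts at 0
	and stays 0 (it is only ANDed with table words), and ORing it into the state is
	a no-op, but the loads it forces touch one word per cache line, so the whole
	table is resident in L1 before the first key-dependent lookup. Because u feeds
	into the result, the compiler cannot discard those loads.
*/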

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasAESNI())
	{
		Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// timing attack countermeasure. see comments at top for more details
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(((const byte *)Td)+i);
	u &= Td[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
	QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
	QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
	QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

	// Nr - 2 full rounds:
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_D(t3, s2, s1, s0, s3)
		QUARTER_ROUND_D(t2, s1, s0, s3, s2)
		QUARTER_ROUND_D(t1, s0, s3, s2, s1)
		QUARTER_ROUND_D(t0, s3, s2, s1, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_D(s3, t2, t1, t0, t3)
		QUARTER_ROUND_D(s2, t1, t0, t3, t2)
		QUARTER_ROUND_D(s1, t0, t3, t2, t1)
		QUARTER_ROUND_D(s0, t3, t2, t1, t0)

		rk += 8;
	} while (--r);

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	// timing attack countermeasure. see comments at top for more details
	// If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
	// QUARTER_ROUND_LD will use Td, which is already preloaded.
	u = 0;
	for (i=0; i<256; i+=cacheLineSize)
		u &= *(const word32 *)(Sd+i);
	u &= *(const word32 *)(Sd+252);
	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
	QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
	QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
	QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

// ************************* Assembly Code ************************************

#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code

#endif	// #ifndef CRYPTOPP_GENERATE_X64_MASM
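
/*
	Editorial note (illustrative, not part of the original source): the routine below
	is the hand-written SSE2/x64 encryption core behind AdvancedProcessBlocks. It
	copies the round keys into an aligned scratch area on the stack (L_SUBKEYS),
	keeps the 16-byte state in eax/ebx/ecx/edx plus one MMX or spare general-purpose
	register, preloads the compressed table Te one cache line at a time, and in
	counter mode caches the first round (and part of the second) in L_SAVED_X so
	only the lookups keyed by the changing counter byte are redone per block.
*/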

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

#define L_REG			esp
#define L_INDEX(i)		(L_REG+512+i)
#define L_INXORBLOCKS	L_INBLOCKS+4
#define L_OUTXORBLOCKS	L_INBLOCKS+8
#define L_OUTBLOCKS		L_INBLOCKS+12
#define L_INCREMENTS	L_INDEX(16*15)
#define L_SP			L_INDEX(16*16)
#define L_LENGTH		L_INDEX(16*16+4)
#define L_KEYS_BEGIN	L_INDEX(16*16+8)

#define MOVD			movd
#define MM(i)			mm##i

#define MXOR(a,b,c)	\
	AS2( movzx esi, b)\
	AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
	AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c)	\
	AS2( movzx esi, b)\
	AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG			r8
#define L_INDEX(i)		(L_REG+i)
#define L_INXORBLOCKS	L_INBLOCKS+8
#define L_OUTXORBLOCKS	L_INBLOCKS+16
#define L_OUTBLOCKS		L_INBLOCKS+24
#define L_INCREMENTS	L_INDEX(16*16)
#define L_LENGTH		L_INDEX(16*18+8)
#define L_KEYS_BEGIN	L_INDEX(16*19)

#define MOVD			mov
#define MM_0			r9d
#define MM_1			r12d
#ifdef __GNUC__
#define MM_2			r11d
#else
#define MM_2			r10d
#endif
#define MM(i)			MM_##i

#define MXOR(a,b,c)	\
	AS2( movzx esi, b)\
	AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c)	\
	AS2( movzx esi, b)\
	AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS		L_INDEX(0)
#define L_SAVED_X		L_SUBKEYS
#define L_KEY12			L_INDEX(16*12)
#define L_LASTROUND		L_INDEX(16*13)
#define L_INBLOCKS		L_INDEX(16*14)
#define MAP0TO4(i)		(ASM_MOD(i+3,4)+1)

#define XOR(a,b,c)	\
	AS2( movzx esi, b)\
	AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c)	\
	AS2( movzx esi, b)\
	AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
	ALIGN 8
	Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
	rex_push_reg rsi
	push_reg rdi
	push_reg rbx
	push_reg r12
	.endprolog
	mov L_REG, rcx
	mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
	mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
	__asm__ __volatile__
	(
	".intel_syntax noprefix;"
	#if CRYPTOPP_BOOL_X64
	AS2( mov L_REG, rcx)
	#endif
	AS_PUSH_IF86(bx)
	AS_PUSH_IF86(bp)
	AS2( mov AS_REG_7, WORD_REG(si))
#else
	AS_PUSH_IF86(si)
	AS_PUSH_IF86(di)
	AS_PUSH_IF86(bx)
	AS_PUSH_IF86(bp)
	AS2( lea AS_REG_7, [Te])
	AS2( mov edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
	AS2( mov [ecx+16*12+16*4], esp)	// save esp to L_SP
	AS2( lea esp, [ecx-512])
#endif

	// copy subkeys to stack
	AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
	AS2( mov WORD_REG(ax), 16)
	AS2( and WORD_REG(ax), WORD_REG(si))
	AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])	// subkey 1 (non-counter) or 2 (counter)
	AS2( movdqa [L_KEY12], xmm3)
	AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
	AS2( sub WORD_REG(ax), WORD_REG(si))
	ASL(0)
	AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
	AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
	AS2( add WORD_REG(si), 16)
	AS2( cmp WORD_REG(si), 16*12)
	ASJ( jl, 0, b)

	// read subkeys 0, 1 and last
	AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)])	// last subkey
	AS2( movdqa xmm1, [WORD_REG(dx)])	// subkey 0
	AS2( MOVD MM(1), [WORD_REG(dx)+4*4])	// 0,1,2,3
	AS2( mov ebx, [WORD_REG(dx)+5*4])	// 4,5,6,7
	AS2( mov ecx, [WORD_REG(dx)+6*4])	// 8,9,10,11
	AS2( mov edx, [WORD_REG(dx)+7*4])	// 12,13,14,15

	// load table into cache
	AS2( xor WORD_REG(ax), WORD_REG(ax))
	ASL(9)
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( cmp WORD_REG(ax), 2048)
	ASJ( jl, 9, b)
	AS1( lfence)

	AS2( test DWORD PTR [L_LENGTH], 1)
	ASJ( jz, 8, f)

	// counter mode one-time setup
	AS2( mov WORD_REG(si), [L_INBLOCKS])
	AS2( movdqu xmm2, [WORD_REG(si)])	// counter
	AS2( pxor xmm2, xmm1)
	AS2( psrldq xmm1, 14)
	AS2( movd eax, xmm1)
	AS2( mov al, BYTE PTR [WORD_REG(si)+15])
	AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86
	AS2( mov eax, 1)
	AS2( movd mm3, eax)
#endif

	// partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
	AS2( movd eax, xmm2)
	AS2( psrldq xmm2, 4)
	AS2( movd edi, xmm2)
	AS2( psrldq xmm2, 4)
	MXOR( 1, al, 0)		// 0
	XOR( edx, ah, 1)	// 1
	AS2( shr eax, 16)
	XOR( ecx, al, 2)	// 2
	XOR( ebx, ah, 3)	// 3
	AS2( mov eax, edi)
	AS2( movd edi, xmm2)
	AS2( psrldq xmm2, 4)
	XOR( ebx, al, 0)	// 4
	MXOR( 1, ah, 1)		// 5
	AS2( shr eax, 16)
	XOR( edx, al, 2)	// 6
	XOR( ecx, ah, 3)	// 7
	AS2( mov eax, edi)
	AS2( movd edi, xmm2)
	XOR( ecx, al, 0)	// 8
	XOR( ebx, ah, 1)	// 9
	AS2( shr eax, 16)
	MXOR( 1, al, 2)		// 10
	XOR( edx, ah, 3)	// 11
	AS2( mov eax, edi)
	XOR( edx, al, 0)	// 12
	XOR( ecx, ah, 1)	// 13
	AS2( shr eax, 16)
	XOR( ebx, al, 2)	// 14
	AS2( psrldq xmm2, 3)

	// partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
	AS2( mov eax, [L_KEY12+0*4])
	AS2( mov edi, [L_KEY12+2*4])
	AS2( MOVD MM(0), [L_KEY12+3*4])
	MXOR( 0, cl, 3)		/* 11 */
	XOR( edi, bl, 3)	/* 7 */
	MXOR( 0, bh, 2)		/* 6 */
	AS2( shr ebx, 16)	/* 4,5 */
	XOR( eax, bl, 1)	/* 5 */
	MOV( ebx, bh, 0)	/* 4 */
	AS2( xor ebx, [L_KEY12+1*4])
	XOR( eax, ch, 2)	/* 10 */
	AS2( shr ecx, 16)	/* 8,9 */
	XOR( eax, dl, 3)	/* 15 */
	XOR( ebx, dh, 2)	/* 14 */
	AS2( shr edx, 16)	/* 12,13 */
	XOR( edi, ch, 0)	/* 8 */
	XOR( ebx, cl, 1)	/* 9 */
	XOR( edi, dl, 1)	/* 13 */
	MXOR( 0, dh, 0)		/* 12 */

	AS2( movd ecx, xmm2)
	AS2( MOVD edx, MM(1))
	AS2( MOVD [L_SAVED_X+3*4], MM(0))
	AS2( mov [L_SAVED_X+0*4], eax)
	AS2( mov [L_SAVED_X+1*4], ebx)
	AS2( mov [L_SAVED_X+2*4], edi)
	ASJ( jmp, 5, f)

	ASL(3)
	// non-counter mode per-block setup
	AS2( MOVD MM(1), [L_KEY12+0*4])	// 0,1,2,3
	AS2( mov ebx, [L_KEY12+1*4])	// 4,5,6,7
	AS2( mov ecx, [L_KEY12+2*4])	// 8,9,10,11
	AS2( mov edx, [L_KEY12+3*4])	// 12,13,14,15
	ASL(8)
	AS2( mov WORD_REG(ax), [L_INBLOCKS])
	AS2( movdqu xmm2, [WORD_REG(ax)])
	AS2( mov WORD_REG(si), [L_INXORBLOCKS])
	AS2( movdqu xmm5, [WORD_REG(si)])
	AS2( pxor xmm2, xmm1)
	AS2( pxor xmm2, xmm5)

	// first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
	AS2( movd eax, xmm2)
	AS2( psrldq xmm2, 4)
	AS2( movd edi, xmm2)
	AS2( psrldq xmm2, 4)
	MXOR( 1, al, 0)		// 0
	XOR( edx, ah, 1)	// 1
	AS2( shr eax, 16)
	XOR( ecx, al, 2)	// 2
	XOR( ebx, ah, 3)	// 3
	AS2( mov eax, edi)
	AS2( movd edi, xmm2)
	AS2( psrldq xmm2, 4)
	XOR( ebx, al, 0)	// 4
	MXOR( 1, ah, 1)		// 5
	AS2( shr eax, 16)
	XOR( edx, al, 2)	// 6
	XOR( ecx, ah, 3)	// 7
	AS2( mov eax, edi)
	AS2( movd edi, xmm2)
	XOR( ecx, al, 0)	// 8
	XOR( ebx, ah, 1)	// 9
	AS2( shr eax, 16)
	MXOR( 1, al, 2)		// 10
	XOR( edx, ah, 3)	// 11
	AS2( mov eax, edi)
	XOR( edx, al, 0)	// 12
	XOR( ecx, ah, 1)	// 13
	AS2( shr eax, 16)
	XOR( ebx, al, 2)	// 14
	MXOR( 1, ah, 3)		// 15
	AS2( MOVD eax, MM(1))

	AS2( add L_REG, [L_KEYS_BEGIN])
	AS2( add L_REG, 4*16)
	ASJ( jmp, 2, f)

	ASL(1)
	// counter-mode per-block setup
	AS2( MOVD ecx, MM(2))
	AS2( MOVD edx, MM(1))
	AS2( mov eax, [L_SAVED_X+0*4])
	AS2( mov ebx, [L_SAVED_X+1*4])
	AS2( xor cl, ch)
	AS2( and WORD_REG(cx), 255)
	ASL(5)
#if CRYPTOPP_BOOL_X86
	AS2( paddb MM(2), mm3)
#else
	AS2( add MM(2), 1)
#endif
	// remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
	AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
	XOR( ebx, dl, 3)
	MOV( ecx, dh, 2)
	AS2( shr edx, 16)
	AS2( xor ecx, [L_SAVED_X+2*4])
	XOR( eax, dh, 0)
	MOV( edx, dl, 1)
	AS2( xor edx, [L_SAVED_X+3*4])

	AS2( add L_REG, [L_KEYS_BEGIN])
	AS2( add L_REG, 3*16)
	ASJ( jmp, 4, f)

// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
// out: eax, ebx, edi, mm0
#define ROUND() \
	MXOR( 0, cl, 3)		/* 11 */\
	AS2( mov cl, al)	/* 8,9,10,3 */\
	XOR( edi, ah, 2)	/* 2 */\
	AS2( shr eax, 16)	/* 0,1 */\
	XOR( edi, bl, 3)	/* 7 */\
	MXOR( 0, bh, 2)		/* 6 */\
	AS2( shr ebx, 16)	/* 4,5 */\
	MXOR( 0, al, 1)		/* 1 */\
	MOV( eax, ah, 0)	/* 0 */\
	XOR( eax, bl, 1)	/* 5 */\
	MOV( ebx, bh, 0)	/* 4 */\
	XOR( eax, ch, 2)	/* 10 */\
	XOR( ebx, cl, 3)	/* 3 */\
	AS2( shr ecx, 16)	/* 8,9 */\
	XOR( eax, dl, 3)	/* 15 */\
	XOR( ebx, dh, 2)	/* 14 */\
	AS2( shr edx, 16)	/* 12,13 */\
	XOR( edi, ch, 0)	/* 8 */\
	XOR( ebx, cl, 1)	/* 9 */\
	XOR( edi, dl, 1)	/* 13 */\
	MXOR( 0, dh, 0)		/* 12 */\

	ASL(2)	// 2-round loop
	AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
	AS2( mov edi, [L_SUBKEYS-4*16+2*4])
	ROUND()
	AS2( mov ecx, edi)
	AS2( xor eax, [L_SUBKEYS-4*16+0*4])
	AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
	AS2( MOVD edx, MM(0))

	ASL(4)
	AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
	AS2( mov edi, [L_SUBKEYS-4*16+6*4])
	ROUND()
	AS2( mov ecx, edi)
	AS2( xor eax, [L_SUBKEYS-4*16+4*4])
	AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
	AS2( MOVD edx, MM(0))

	AS2( add L_REG, 32)
	AS2( test L_REG, 255)
	ASJ( jnz, 2, b)
	AS2( sub L_REG, 16*16)

#define LAST(a, b, c) \
	AS2( movzx esi, a )\
	AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
	AS2( movzx esi, b )\
	AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
	AS2( mov WORD PTR [L_LASTROUND+c], di )\

	// last round
	LAST(ch, dl, 2)
	LAST(dh, al, 6)
	AS2( shr edx, 16)
	LAST(ah, bl, 10)
	AS2( shr eax, 16)
	LAST(bh, cl, 14)
	AS2( shr ebx, 16)
	LAST(dh, al, 12)
	AS2( shr ecx, 16)
	LAST(ah, bl, 0)
	LAST(bh, cl, 4)
	LAST(ch, dl, 8)

	AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
	AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

	AS2( mov WORD_REG(cx), [L_LENGTH])
	AS2( sub WORD_REG(cx), 16)

	AS2( movdqu xmm2, [WORD_REG(ax)])
	AS2( pxor xmm2, xmm4)

#if CRYPTOPP_BOOL_X86
	AS2( movdqa xmm0, [L_INCREMENTS])
	AS2( paddd xmm0, [L_INBLOCKS])
	AS2( movdqa [L_INBLOCKS], xmm0)
#else
	AS2( movdqa xmm0, [L_INCREMENTS+16])
	AS2( paddq xmm0, [L_INBLOCKS+16])
	AS2( movdqa [L_INBLOCKS+16], xmm0)
#endif

	AS2( pxor xmm2, [L_LASTROUND])
	AS2( movdqu [WORD_REG(bx)], xmm2)

	ASJ( jle, 7, f)
	AS2( mov [L_LENGTH], WORD_REG(cx))
	AS2( test WORD_REG(cx), 1)
	ASJ( jnz, 1, b)
#if CRYPTOPP_BOOL_X64
	AS2( movdqa xmm0, [L_INCREMENTS])
	AS2( paddq xmm0, [L_INBLOCKS])
	AS2( movdqa [L_INBLOCKS], xmm0)
#endif
	ASJ( jmp, 3, b)

	ASL(7)
	// erase keys on stack
	AS2( xorps xmm0, xmm0)
	AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
	AS2( movaps [WORD_REG(ax)-7*16], xmm0)
	AS2( movaps [WORD_REG(ax)-6*16], xmm0)
	AS2( movaps [WORD_REG(ax)-5*16], xmm0)
	AS2( movaps [WORD_REG(ax)-4*16], xmm0)
	AS2( movaps [WORD_REG(ax)-3*16], xmm0)
	AS2( movaps [WORD_REG(ax)-2*16], xmm0)
	AS2( movaps [WORD_REG(ax)-1*16], xmm0)
	AS2( movaps [WORD_REG(ax)+0*16], xmm0)
	AS2( movaps [WORD_REG(ax)+1*16], xmm0)
	AS2( movaps [WORD_REG(ax)+2*16], xmm0)
	AS2( movaps [WORD_REG(ax)+3*16], xmm0)
	AS2( movaps [WORD_REG(ax)+4*16], xmm0)
	AS2( movaps [WORD_REG(ax)+5*16], xmm0)
	AS2( movaps [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
	AS2( mov esp, [L_SP])
	AS1( emms)
#endif
	AS_POP_IF86(bp)
	AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
	AS_POP_IF86(di)
	AS_POP_IF86(si)
	AS1(ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
	pop r12
	pop rbx
	pop rdi
	pop rsi
	ret
	Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
	".att_syntax prefix;"
	:
	: "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
	: "memory", "cc", "%eax"
	#if CRYPTOPP_BOOL_X64
	, "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
	#endif
	);
#endif
}

#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86

static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
	size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
	size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
	if (t1 > t0)
		return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
	else
		return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
	block = _mm_xor_si128(block, subkeys[0]);
	for (unsigned int i=1; i<rounds-1; i+=2)
	{
		block = _mm_aesenc_si128(block, subkeys[i]);
		block = _mm_aesenc_si128(block, subkeys[i+1]);
	}
	block = _mm_aesenc_si128(block, subkeys[rounds-1]);
	block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
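
/*
	Editorial note (illustrative, not part of the original source): each AES-NI block
	function XORs in subkey 0, performs rounds-1 AESENC (or AESDEC) rounds and
	finishes with AESENCLAST (or AESDECLAST). The loop above can step by two because
	rounds is always even here (10, 12 or 14).
*/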

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
	__m128i rk = subkeys[0];
	block0 = _mm_xor_si128(block0, rk);
	block1 = _mm_xor_si128(block1, rk);
	block2 = _mm_xor_si128(block2, rk);
	block3 = _mm_xor_si128(block3, rk);
	for (unsigned int i=1; i<rounds; i++)
	{
		rk = subkeys[i];
		block0 = _mm_aesenc_si128(block0, rk);
		block1 = _mm_aesenc_si128(block1, rk);
		block2 = _mm_aesenc_si128(block2, rk);
		block3 = _mm_aesenc_si128(block3, rk);
	}
	rk = subkeys[rounds];
	block0 = _mm_aesenclast_si128(block0, rk);
	block1 = _mm_aesenclast_si128(block1, rk);
	block2 = _mm_aesenclast_si128(block2, rk);
	block3 = _mm_aesenclast_si128(block3, rk);
}

inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
	block = _mm_xor_si128(block, subkeys[0]);
	for (unsigned int i=1; i<rounds-1; i+=2)
	{
		block = _mm_aesdec_si128(block, subkeys[i]);
		block = _mm_aesdec_si128(block, subkeys[i+1]);
	}
	block = _mm_aesdec_si128(block, subkeys[rounds-1]);
	block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
	__m128i rk = subkeys[0];
	block0 = _mm_xor_si128(block0, rk);
	block1 = _mm_xor_si128(block1, rk);
	block2 = _mm_xor_si128(block2, rk);
	block3 = _mm_xor_si128(block3, rk);
	for (unsigned int i=1; i<rounds; i++)
	{
		rk = subkeys[i];
		block0 = _mm_aesdec_si128(block0, rk);
		block1 = _mm_aesdec_si128(block1, rk);
		block2 = _mm_aesdec_si128(block2, rk);
		block3 = _mm_aesdec_si128(block3, rk);
	}
	rk = subkeys[rounds];
	block0 = _mm_aesdeclast_si128(block0, rk);
	block1 = _mm_aesdeclast_si128(block1, rk);
	block2 = _mm_aesdeclast_si128(block2, rk);
	block3 = _mm_aesdeclast_si128(block3, rk);
}

static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
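
/*
	Editorial note (illustrative, not part of the original source): s_one appears to
	act as a big-endian "+1" for the counter block: the 32-bit add in the highest
	lane bumps byte 15, matching the single-block path below that simply increments
	inBlocks[15]; carry propagation across bytes is presumably handled by the caller.
	Processing four blocks per iteration lets the pipelined AESENC instructions for
	independent blocks overlap, which is where most of the AES-NI speedup comes from.
*/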

template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
	size_t blockSize = 16;
	size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
	size_t xorIncrement = xorBlocks ? blockSize : 0;
	size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

	if (flags & BlockTransformation::BT_ReverseDirection)
	{
		assert(length % blockSize == 0);
		inBlocks += length - blockSize;
		xorBlocks += length - blockSize;
		outBlocks += length - blockSize;
		inIncrement = 0-inIncrement;
		xorIncrement = 0-xorIncrement;
		outIncrement = 0-outIncrement;
	}

	if (flags & BlockTransformation::BT_AllowParallel)
	{
		while (length >= 4*blockSize)
		{
			__m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
			if (flags & BlockTransformation::BT_InBlockIsCounter)
			{
				const __m128i be1 = *(const __m128i *)s_one;
				block1 = _mm_add_epi32(block0, be1);
				block2 = _mm_add_epi32(block1, be1);
				block3 = _mm_add_epi32(block2, be1);
				_mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
			}
			else
			{
				inBlocks += inIncrement;
				block1 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
				block2 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
				block3 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
			}

			if (flags & BlockTransformation::BT_XorInput)
			{
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
			}

			func4(block0, block1, block2, block3, subkeys, rounds);

			if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
			{
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
			}

			_mm_storeu_si128((__m128i *)outBlocks, block0);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block1);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block2);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block3);
			outBlocks += outIncrement;

			length -= 4*blockSize;
		}
	}

	while (length >= blockSize)
	{
		__m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

		if (flags & BlockTransformation::BT_XorInput)
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

		if (flags & BlockTransformation::BT_InBlockIsCounter)
			const_cast<byte *>(inBlocks)[15]++;

		func1(block, subkeys, rounds);

		if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

		_mm_storeu_si128((__m128i *)outBlocks, block);

		inBlocks += inIncrement;
		outBlocks += outIncrement;
		xorBlocks += xorIncrement;
		length -= blockSize;
	}

	return length;
}
#endif

size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasAESNI())
		return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	if (HasSSE2())
	{
		if (length < BLOCKSIZE)
			return length;

		struct Locals
		{
			word32 subkeys[4*12], workspace[8];
			const byte *inBlocks, *inXorBlocks, *outXorBlocks;
			byte *outBlocks;
			size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
			size_t regSpill, lengthAndCounterFlag, keysBegin;
		};

		size_t increment = BLOCKSIZE;
		const byte* zeros = (byte *)(Te+256);
		byte *space;

		do {
			space = (byte *)alloca(255+sizeof(Locals));
			space += (256-(size_t)space%256)%256;
		}
		while (AliasedWithTable(space, space+sizeof(Locals)));

		if (flags & BT_ReverseDirection)
		{
			assert(length % BLOCKSIZE == 0);
			inBlocks += length - BLOCKSIZE;
			xorBlocks += length - BLOCKSIZE;
			outBlocks += length - BLOCKSIZE;
			increment = 0-increment;
		}

		Locals &locals = *(Locals *)space;

		locals.inBlocks = inBlocks;
		locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
		locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
		locals.outBlocks = outBlocks;

		locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
		locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
		locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
		locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

		locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
		int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
		locals.keysBegin = (12-keysToCopy)*16;

		Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
		return length % BLOCKSIZE;
	}
#endif

	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
	if (HasAESNI())
		return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif	// #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

NAMESPACE_END

#endif
#endif
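
/*
	Editorial note (illustrative, not part of the original source): applications do
	not call this translation unit directly; they reach it through the library's
	high-level API. A minimal sketch, assuming the usual Crypto++ headers, is kept
	under #if 0 so it does not affect this file:
*/
#if 0
#include "aes.h"
#include "modes.h"
#include "osrng.h"
#include "filters.h"
#include "secblock.h"
#include <string>

void ExampleCbcEncrypt()
{
	using namespace CryptoPP;

	AutoSeededRandomPool prng;
	SecByteBlock key(AES::DEFAULT_KEYLENGTH);	// 16-byte key -> the 10-round path above
	prng.GenerateBlock(key, key.size());

	byte iv[AES::BLOCKSIZE];
	prng.GenerateBlock(iv, sizeof(iv));

	CBC_Mode<AES>::Encryption enc;
	enc.SetKeyWithIV(key, key.size(), iv);	// ends up in Rijndael::Base::UncheckedSetKey

	std::string plain = "sixteen byte msg", cipher;
	StringSource(plain, true,
		new StreamTransformationFilter(enc, new StringSink(cipher)));	// drives ProcessAndXorBlock / AdvancedProcessBlocks
}
#endif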