- Timestamp:
- Feb 7, 2019, 8:36:33 AM (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
asp3_tinet_ecnl_rx/trunk/wolfssl-3.12.2/wolfcrypt/src/aes.c
r359 r372 28 28 #include <wolfssl/wolfcrypt/error-crypt.h> 29 29 30 #ifndef NO_AES 30 #if !defined(NO_AES) 31 32 /* Tip: Locate the software cipher modes by searching for "Software AES" */ 33 34 #if defined(HAVE_FIPS) && \ 35 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) 36 37 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ 38 #define FIPS_NO_WRAPPERS 39 40 #ifdef USE_WINDOWS_API 41 #pragma code_seg(".fipsA$g") 42 #pragma const_seg(".fipsB$g") 43 #endif 44 #endif 45 31 46 #include <wolfssl/wolfcrypt/aes.h> 32 47 #include <wolfssl/wolfcrypt/cpuid.h> 33 48 49 #ifdef WOLF_CRYPTO_DEV 50 #include <wolfssl/wolfcrypt/cryptodev.h> 51 #endif 52 34 53 35 54 /* fips wrapper calls, user can call direct */ 36 #ifdef HAVE_FIPS 55 #if defined(HAVE_FIPS) && \ 56 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2)) 57 37 58 int wc_AesSetKey(Aes* aes, const byte* key, word32 len, const byte* iv, 38 59 int dir) … … 173 194 174 195 /* AES-CCM */ 175 #ifdef HAVE_AESCCM 176 void wc_AesCcmSetKey(Aes* aes, const byte* key, word32 keySz) 196 #if defined(HAVE_AESCCM) && \ 197 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) 198 int wc_AesCcmSetKey(Aes* aes, const byte* key, word32 keySz) 177 199 { 178 AesCcmSetKey(aes, key, keySz);200 return AesCcmSetKey(aes, key, keySz); 179 201 } 180 202 int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, … … 210 232 } 211 233 #endif /* HAVE_AES_DECRYPT */ 212 #endif /* HAVE_AESCCM */234 #endif /* HAVE_AESCCM && HAVE_FIPS_VERSION 2 */ 213 235 214 236 int wc_AesInit(Aes* aes, void* h, int i) 215 237 { 216 (void)aes; 238 if (aes == NULL) 239 return BAD_FUNC_ARG; 240 217 241 (void)h; 218 242 (void)i; 243 219 244 /* FIPS doesn't support: 220 245 return AesInit(aes, h, i); */ … … 228 253 } 229 254 230 #else /* HAVE_FIPS*/255 #else /* else build without fips, or for FIPS v2 */ 231 256 232 257 … … 244 269 #endif 245 270 246 #ifndef WOLFSSL_ARMASM 271 #if !defined(WOLFSSL_ARMASM) 272 273 #ifdef 
WOLFSSL_IMX6_CAAM_BLOB 274 /* case of possibly not using hardware acceleration for AES but using key 275 blobs */ 276 #include <wolfssl/wolfcrypt/port/caam/wolfcaam.h> 277 #endif 247 278 248 279 #ifdef DEBUG_AESNI … … 258 289 /* Define AES implementation includes and functions */ 259 290 #if defined(STM32_CRYPTO) 260 /* STM32F2/F4 hardware AES support for CBC, CTR modes */ 261 262 /* CRYPT_AES_GCM starts the IV with 2 */ 263 #define STM32_GCM_IV_START 2 291 /* STM32F2/F4/F7/L4 hardware AES support for ECB, CBC, CTR and GCM modes */ 264 292 265 293 #if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESGCM) || defined(HAVE_AESCCM) 294 266 295 static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) 267 296 { … … 269 298 #ifdef WOLFSSL_STM32_CUBEMX 270 299 CRYP_HandleTypeDef hcryp; 271 272 XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); 273 switch(aes->rounds) { 274 case 10: /* 128-bit key */ 275 hcryp.Init.KeySize = CRYP_KEYSIZE_128B; 276 break; 277 case 12: /* 192-bit key */ 278 hcryp.Init.KeySize = CRYP_KEYSIZE_192B; 279 break; 280 case 14: /* 256-bit key */ 281 hcryp.Init.KeySize = CRYP_KEYSIZE_256B; 282 break; 283 default: 284 break; 285 } 286 hcryp.Instance = CRYP; 287 hcryp.Init.DataType = CRYP_DATATYPE_8B; 288 hcryp.Init.pKey = (uint8_t*)aes->key; 289 300 #else 301 CRYP_InitTypeDef cryptInit; 302 CRYP_KeyInitTypeDef keyInit; 303 #endif 304 305 #ifdef WOLFSSL_STM32_CUBEMX 306 ret = wc_Stm32_Aes_Init(aes, &hcryp); 307 if (ret != 0) 308 return ret; 309 310 #ifdef STM32_CRYPTO_AES_ONLY 311 hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT; 312 hcryp.Init.ChainingMode = CRYP_CHAINMODE_AES_ECB; 313 hcryp.Init.KeyWriteFlag = CRYP_KEY_WRITE_ENABLE; 314 #endif 290 315 HAL_CRYP_Init(&hcryp); 291 316 292 if (HAL_CRYP_AESECB_Encrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE, 293 outBlock, STM32_HAL_TIMEOUT) != HAL_OK) { 317 #ifdef STM32_CRYPTO_AES_ONLY 318 ret = HAL_CRYPEx_AES(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE, 319 outBlock, STM32_HAL_TIMEOUT); 320 
#else 321 ret = HAL_CRYP_AESECB_Encrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE, 322 outBlock, STM32_HAL_TIMEOUT); 323 #endif 324 if (ret != HAL_OK) { 294 325 ret = WC_TIMEOUT_E; 295 326 } 296 297 327 HAL_CRYP_DeInit(&hcryp); 298 #else 299 word32 *enc_key; 300 CRYP_InitTypeDef AES_CRYP_InitStructure; 301 CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure; 302 303 enc_key = aes->key; 304 305 /* crypto structure initialization */ 306 CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure); 307 CRYP_StructInit(&AES_CRYP_InitStructure); 328 329 #else /* STD_PERI_LIB */ 330 ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit); 331 if (ret != 0) 332 return ret; 308 333 309 334 /* reset registers to their default values */ 310 335 CRYP_DeInit(); 311 336 312 /* load key into correct registers */ 313 switch (aes->rounds) { 314 case 10: /* 128-bit key */ 315 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b; 316 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[0]; 317 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1]; 318 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[2]; 319 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3]; 320 break; 321 322 case 12: /* 192-bit key */ 323 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b; 324 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[0]; 325 AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1]; 326 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[2]; 327 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3]; 328 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[4]; 329 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5]; 330 break; 331 332 case 14: /* 256-bit key */ 333 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b; 334 AES_CRYP_KeyInitStructure.CRYP_Key0Left = enc_key[0]; 335 AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1]; 336 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[2]; 337 AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[3]; 338 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[4]; 339 
AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5]; 340 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[6]; 341 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7]; 342 break; 343 344 default: 345 break; 346 } 347 CRYP_KeyInit(&AES_CRYP_KeyInitStructure); 348 349 /* set direction, mode, and datatype */ 350 AES_CRYP_InitStructure.CRYP_AlgoDir = CRYP_AlgoDir_Encrypt; 351 AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB; 352 AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b; 353 CRYP_Init(&AES_CRYP_InitStructure); 337 /* setup key */ 338 CRYP_KeyInit(&keyInit); 339 340 /* set direction and mode */ 341 cryptInit.CRYP_AlgoDir = CRYP_AlgoDir_Encrypt; 342 cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB; 343 CRYP_Init(&cryptInit); 354 344 355 345 /* enable crypto processor */ … … 386 376 #ifdef WOLFSSL_STM32_CUBEMX 387 377 CRYP_HandleTypeDef hcryp; 388 389 XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); 390 switch(aes->rounds) { 391 case 10: /* 128-bit key */ 392 hcryp.Init.KeySize = CRYP_KEYSIZE_128B; 393 break; 394 case 12: /* 192-bit key */ 395 hcryp.Init.KeySize = CRYP_KEYSIZE_192B; 396 break; 397 case 14: /* 256-bit key */ 398 hcryp.Init.KeySize = CRYP_KEYSIZE_256B; 399 break; 400 default: 401 break; 402 } 403 hcryp.Instance = CRYP; 404 hcryp.Init.DataType = CRYP_DATATYPE_8B; 405 hcryp.Init.pKey = (uint8_t*)aes->key; 406 378 #else 379 CRYP_InitTypeDef cryptInit; 380 CRYP_KeyInitTypeDef keyInit; 381 #endif 382 383 #ifdef WOLFSSL_STM32_CUBEMX 384 ret = wc_Stm32_Aes_Init(aes, &hcryp); 385 if (ret != 0) 386 return ret; 387 388 #ifdef STM32_CRYPTO_AES_ONLY 389 hcryp.Init.OperatingMode = CRYP_ALGOMODE_DECRYPT; 390 hcryp.Init.ChainingMode = CRYP_CHAINMODE_AES_ECB; 391 hcryp.Init.KeyWriteFlag = CRYP_KEY_WRITE_ENABLE; 392 #endif 407 393 HAL_CRYP_Init(&hcryp); 408 394 409 if (HAL_CRYP_AESECB_Decrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE, 410 outBlock, STM32_HAL_TIMEOUT) != HAL_OK) { 395 #ifdef STM32_CRYPTO_AES_ONLY 396 ret = HAL_CRYPEx_AES(&hcryp, 
(uint8_t*)inBlock, AES_BLOCK_SIZE, 397 outBlock, STM32_HAL_TIMEOUT); 398 #else 399 ret = HAL_CRYP_AESECB_Decrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE, 400 outBlock, STM32_HAL_TIMEOUT) 401 #endif 402 if (ret != HAL_OK) { 411 403 ret = WC_TIMEOUT_E; 412 404 } 413 414 405 HAL_CRYP_DeInit(&hcryp); 415 #else 416 word32 *enc_key; 417 CRYP_InitTypeDef AES_CRYP_InitStructure; 418 CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure; 419 420 enc_key = aes->key; 421 422 /* crypto structure initialization */ 423 CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure); 424 CRYP_StructInit(&AES_CRYP_InitStructure); 406 407 #else /* STD_PERI_LIB */ 408 ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit); 409 if (ret != 0) 410 return ret; 425 411 426 412 /* reset registers to their default values */ 427 413 CRYP_DeInit(); 428 414 429 /* load key into correct registers */ 430 switch (aes->rounds) { 431 case 10: /* 128-bit key */ 432 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b; 433 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[0]; 434 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1]; 435 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[2]; 436 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3]; 437 break; 438 439 case 12: /* 192-bit key */ 440 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b; 441 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[0]; 442 AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1]; 443 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[2]; 444 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3]; 445 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[4]; 446 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5]; 447 break; 448 449 case 14: /* 256-bit key */ 450 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b; 451 AES_CRYP_KeyInitStructure.CRYP_Key0Left = enc_key[0]; 452 AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1]; 453 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[2]; 454 AES_CRYP_KeyInitStructure.CRYP_Key1Right = 
enc_key[3]; 455 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[4]; 456 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5]; 457 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[6]; 458 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7]; 459 break; 460 461 default: 462 break; 463 } 464 CRYP_KeyInit(&AES_CRYP_KeyInitStructure); 465 466 /* set direction, key, and datatype */ 467 AES_CRYP_InitStructure.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 468 AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key; 469 AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b; 470 CRYP_Init(&AES_CRYP_InitStructure); 415 /* set direction and key */ 416 CRYP_KeyInit(&keyInit); 417 cryptInit.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 418 cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key; 419 CRYP_Init(&cryptInit); 471 420 472 421 /* enable crypto processor */ 473 422 CRYP_Cmd(ENABLE); 474 423 475 /* wait until decrypt key has been in tialized */424 /* wait until decrypt key has been initialized */ 476 425 while (CRYP_GetFlagStatus(CRYP_FLAG_BUSY) != RESET) {} 477 426 478 /* set direction, mode, and datatype */ 479 AES_CRYP_InitStructure.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 480 AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB; 481 AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b; 482 CRYP_Init(&AES_CRYP_InitStructure); 427 /* set direction and mode */ 428 cryptInit.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 429 cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB; 430 CRYP_Init(&cryptInit); 483 431 484 432 /* enable crypto processor */ … … 782 730 #endif /* HAVE_AES_DECRYPT */ 783 731 732 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) 733 static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) 734 { 735 wc_AesEncryptDirect(aes, outBlock, inBlock); 736 return 0; 737 } 738 739 #elif defined(WOLFSSL_AFALG) 740 #elif defined(WOLFSSL_DEVCRYPTO_AES) 741 /* if all AES is enabled with devcrypto then tables are not needed */ 742 784 743 #else 785 744 786 
/* using wolfCrypt software AESimplementation */745 /* using wolfCrypt software implementation */ 787 746 #define NEED_AES_TABLES 788 747 #endif … … 1336 1295 1337 1296 1297 #if defined(HAVE_AES_CBC) || defined(WOLFSSL_AES_DIRECT) 1338 1298 static const byte Td4[256] = 1339 1299 { … … 1371 1331 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU, 1372 1332 }; 1333 #endif /* HAVE_AES_CBC || WOLFSSL_AES_DIRECT */ 1373 1334 #endif /* HAVE_AES_DECRYPT */ 1374 1335 … … 1391 1352 1392 1353 /* load 4 Te Tables into cache by cache line stride */ 1393 static INLINE word32 PreFetchTe(void)1354 static WC_INLINE word32 PreFetchTe(void) 1394 1355 { 1395 1356 word32 x = 0; … … 1405 1366 } 1406 1367 1407 1368 /* Software AES - ECB Encrypt */ 1408 1369 static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) 1409 1370 { … … 1415 1376 if (r > 7 || r == 0) { 1416 1377 WOLFSSL_MSG("AesEncrypt encountered improper key, set it up"); 1417 return; /* stop instead of seg faulting, set up your keys! */1378 return; /* stop instead of seg-faulting, set up your keys! 
*/ 1418 1379 } 1419 1380 … … 1594 1555 1595 1556 #if defined(HAVE_AES_DECRYPT) 1596 #if defined(HAVE_AES_CBC) || defined(WOLFSSL_AES_DIRECT) 1557 #if (defined(HAVE_AES_CBC) || defined(WOLFSSL_AES_DIRECT)) && \ 1558 !defined(WOLFSSL_DEVCRYPTO_CBC) 1597 1559 1598 1560 /* load 4 Td Tables into cache by cache line stride */ 1599 static INLINE word32 PreFetchTd(void)1561 static WC_INLINE word32 PreFetchTd(void) 1600 1562 { 1601 1563 word32 x = 0; … … 1612 1574 1613 1575 /* load Td Table4 into cache by cache line stride */ 1614 static INLINE word32 PreFetchTd4(void)1576 static WC_INLINE word32 PreFetchTd4(void) 1615 1577 { 1616 1578 word32 x = 0; … … 1623 1585 } 1624 1586 1587 /* Software AES - ECB Decrypt */ 1625 1588 static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) 1626 1589 { … … 1632 1595 if (r > 7 || r == 0) { 1633 1596 WOLFSSL_MSG("AesDecrypt encountered improper key, set it up"); 1634 return; /* stop instead of seg faulting, set up your keys! */1597 return; /* stop instead of seg-faulting, set up your keys! 
*/ 1635 1598 } 1636 1599 #ifdef WOLFSSL_AESNI … … 1803 1766 (void)dir; 1804 1767 1805 if (!((keylen == 16) || (keylen == 24) || (keylen == 32))) 1768 if (keylen != 16 && 1769 #ifdef WOLFSSL_AES_192 1770 keylen != 24 && 1771 #endif 1772 keylen != 32) { 1806 1773 return BAD_FUNC_ARG; 1774 } 1807 1775 1808 1776 aes->keylen = keylen; … … 1812 1780 ByteReverseWords(rk, rk, keylen); 1813 1781 #endif 1814 #if def WOLFSSL_AES_COUNTER1782 #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) 1815 1783 aes->left = 0; 1816 1784 #endif … … 1885 1853 XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE); 1886 1854 1855 #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) 1856 aes->left = 0; 1857 #endif 1858 1887 1859 return 0; 1888 1860 } … … 1891 1863 int dir) 1892 1864 { 1893 if ( !((keylen == 16) || (keylen == 24) || (keylen == 32)))1865 if (aes == NULL || !((keylen == 16) || (keylen == 24) || (keylen == 32))) 1894 1866 return BAD_FUNC_ARG; 1895 1867 1896 1868 aes->rounds = keylen/4 + 6; 1897 1869 XMEMCPY(aes->key, userKey, keylen); 1898 #ifdef WOLFSSL_AES_COUNTER 1870 1871 #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) 1899 1872 aes->left = 0; 1900 1873 #endif … … 1923 1896 return BAD_FUNC_ARG; 1924 1897 1925 #if def WOLFSSL_AES_COUNTER1898 #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) 1926 1899 aes->left = 0; 1927 1900 #endif 1928 aes->keylen = keylen; 1901 1929 1902 aes->rounds = keylen/4 + 6; 1930 1903 … … 1966 1939 ret = nrf51_aes_set_key(userKey); 1967 1940 1941 #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) 1942 aes->left = 0; 1943 #endif 1944 1968 1945 return ret; 1969 1946 } … … 1975 1952 } 1976 1953 1954 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) 1955 /* implemented in wolfcrypt/src/port/caam/caam_aes.c */ 1956 1957 #elif defined(WOLFSSL_AFALG) 1958 /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */ 1959 1960 #elif defined(WOLFSSL_DEVCRYPTO_AES) 1961 /* implemented in 
wolfcrypt/src/port/devcrypto/devcrypto_aes.c */ 1962 1977 1963 #else 1964 1965 /* Software AES - SetKey */ 1978 1966 static int wc_AesSetKeyLocal(Aes* aes, const byte* userKey, word32 keylen, 1979 1967 const byte* iv, int dir) … … 1988 1976 aes->use_aesni = 0; 1989 1977 #endif /* WOLFSSL_AESNI */ 1990 #ifdef WOLFSSL_AES_COUNTER1978 #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) 1991 1979 aes->left = 0; 1992 #endif /* WOLFSSL_AES_COUNTER */1980 #endif 1993 1981 1994 1982 aes->keylen = keylen; … … 2003 1991 2004 1992 switch (keylen) { 2005 #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 1993 #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 && \ 1994 defined(WOLFSSL_AES_128) 2006 1995 case 16: 2007 1996 while (1) … … 2024 2013 #endif /* 128 */ 2025 2014 2026 #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192 2015 #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192 && \ 2016 defined(WOLFSSL_AES_192) 2027 2017 case 24: 2028 2018 /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */ … … 2048 2038 #endif /* 192 */ 2049 2039 2050 #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256 2040 #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256 && \ 2041 defined(WOLFSSL_AES_256) 2051 2042 case 32: 2052 2043 while (1) … … 2132 2123 const byte* iv, int dir) 2133 2124 { 2125 int ret; 2134 2126 #if defined(AES_MAX_KEY_SIZE) 2135 2127 const word32 max_key_len = (AES_MAX_KEY_SIZE / 8); 2136 2128 #endif 2137 2129 2130 #ifdef WOLFSSL_IMX6_CAAM_BLOB 2131 byte local[32]; 2132 word32 localSz = 32; 2133 2134 if (keylen == (16 + WC_CAAM_BLOB_SZ) || 2135 keylen == (24 + WC_CAAM_BLOB_SZ) || 2136 keylen == (32 + WC_CAAM_BLOB_SZ)) { 2137 if (wc_caamOpenBlob((byte*)userKey, keylen, local, &localSz) != 0) { 2138 return BAD_FUNC_ARG; 2139 } 2140 2141 /* set local values */ 2142 userKey = local; 2143 keylen = localSz; 2144 } 2145 #endif 2138 2146 if (aes == NULL || 2139 2147 !((keylen == 16) || (keylen == 24) || (keylen == 32))) { … … 2164 2172 
} 2165 2173 if (haveAESNI) { 2166 #if def WOLFSSL_AES_COUNTER2174 #if defined(WOLFSSL_AES_COUNTER) || defined(WOLFSSL_AES_CFB) 2167 2175 aes->left = 0; 2168 2176 #endif /* WOLFSSL_AES_COUNTER */ … … 2179 2187 #endif /* WOLFSSL_AESNI */ 2180 2188 2181 return wc_AesSetKeyLocal(aes, userKey, keylen, iv, dir); 2189 ret = wc_AesSetKeyLocal(aes, userKey, keylen, iv, dir); 2190 2191 #if defined(WOLFSSL_DEVCRYPTO) && \ 2192 (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC)) 2193 aes->ctx.cfd = -1; 2194 XMEMCPY(aes->devKey, userKey, keylen); 2195 #endif 2196 #ifdef WOLFSSL_IMX6_CAAM_BLOB 2197 ForceZero(local, sizeof(local)); 2198 #endif 2199 return ret; 2182 2200 } 2183 2201 … … 2187 2205 const byte* iv, int dir) 2188 2206 { 2189 return wc_AesSetKeyLocal(aes, userKey, keylen, iv, dir); 2207 int ret; 2208 2209 #ifdef WOLFSSL_IMX6_CAAM_BLOB 2210 byte local[32]; 2211 word32 localSz = 32; 2212 2213 if (keylen == (16 + WC_CAAM_BLOB_SZ) || 2214 keylen == (24 + WC_CAAM_BLOB_SZ) || 2215 keylen == (32 + WC_CAAM_BLOB_SZ)) { 2216 if (wc_caamOpenBlob((byte*)userKey, keylen, local, &localSz) 2217 != 0) { 2218 return BAD_FUNC_ARG; 2219 } 2220 2221 /* set local values */ 2222 userKey = local; 2223 keylen = localSz; 2224 } 2225 #endif 2226 ret = wc_AesSetKeyLocal(aes, userKey, keylen, iv, dir); 2227 2228 #ifdef WOLFSSL_IMX6_CAAM_BLOB 2229 ForceZero(local, sizeof(local)); 2230 #endif 2231 2232 return ret; 2190 2233 } 2191 2234 #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ … … 2239 2282 } 2240 2283 2284 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) 2285 /* implemented in wolfcrypt/src/port/caam/caam_aes.c */ 2286 2287 #elif defined(WOLFSSL_AFALG) 2288 /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */ 2289 2290 #elif defined(WOLFSSL_DEVCRYPTO_AES) 2291 /* implemented in wolfcrypt/src/port/devcrypt/devcrypto_aes.c */ 2292 2241 2293 #else 2242 2294 /* Allow direct access to one block encrypt */ … … 2267 2319 CRYP_HandleTypeDef hcryp; 2268 2320 
2269 XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); 2270 switch (aes->rounds) { 2271 case 10: /* 128-bit key */ 2272 hcryp.Init.KeySize = CRYP_KEYSIZE_128B; 2273 break; 2274 case 12: /* 192-bit key */ 2275 hcryp.Init.KeySize = CRYP_KEYSIZE_192B; 2276 break; 2277 case 14: /* 256-bit key */ 2278 hcryp.Init.KeySize = CRYP_KEYSIZE_256B; 2279 break; 2280 default: 2281 break; 2282 } 2283 hcryp.Instance = CRYP; 2284 hcryp.Init.DataType = CRYP_DATATYPE_8B; 2285 hcryp.Init.pKey = (uint8_t*)aes->key; 2321 ret = wc_Stm32_Aes_Init(aes, &hcryp); 2322 if (ret != 0) 2323 return ret; 2324 2325 #ifdef STM32_CRYPTO_AES_ONLY 2326 hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT; 2327 hcryp.Init.ChainingMode = CRYP_CHAINMODE_AES_CBC; 2328 hcryp.Init.KeyWriteFlag = CRYP_KEY_WRITE_ENABLE; 2329 #endif 2286 2330 hcryp.Init.pInitVect = (uint8_t*)aes->reg; 2287 2288 2331 HAL_CRYP_Init(&hcryp); 2289 2332 2290 2333 while (blocks--) { 2291 if (HAL_CRYP_AESCBC_Encrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE, 2292 out, STM32_HAL_TIMEOUT) != HAL_OK) { 2334 #ifdef STM32_CRYPTO_AES_ONLY 2335 ret = HAL_CRYPEx_AES(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE, 2336 out, STM32_HAL_TIMEOUT); 2337 #else 2338 ret = HAL_CRYP_AESCBC_Encrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE, 2339 out, STM32_HAL_TIMEOUT); 2340 #endif 2341 if (ret != HAL_OK) { 2293 2342 ret = WC_TIMEOUT_E; 2294 2343 break; … … 2314 2363 CRYP_HandleTypeDef hcryp; 2315 2364 2316 XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); 2317 switch (aes->rounds) { 2318 case 10: /* 128-bit key */ 2319 hcryp.Init.KeySize = CRYP_KEYSIZE_128B; 2365 ret = wc_Stm32_Aes_Init(aes, &hcryp); 2366 if (ret != 0) 2367 return ret; 2368 2369 /* if input and output same will overwrite input iv */ 2370 XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); 2371 2372 #ifdef STM32_CRYPTO_AES_ONLY 2373 hcryp.Init.OperatingMode = CRYP_ALGOMODE_KEYDERIVATION_DECRYPT; 2374 hcryp.Init.ChainingMode = CRYP_CHAINMODE_AES_CBC; 2375 hcryp.Init.KeyWriteFlag = CRYP_KEY_WRITE_ENABLE; 
2376 #endif 2377 2378 hcryp.Init.pInitVect = (uint8_t*)aes->reg; 2379 HAL_CRYP_Init(&hcryp); 2380 2381 while (blocks--) { 2382 #ifdef STM32_CRYPTO_AES_ONLY 2383 ret = HAL_CRYPEx_AES(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE, 2384 out, STM32_HAL_TIMEOUT); 2385 #else 2386 ret = HAL_CRYP_AESCBC_Decrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE, 2387 out, STM32_HAL_TIMEOUT); 2388 #endif 2389 if (ret != HAL_OK) { 2390 ret = WC_TIMEOUT_E; 2320 2391 break; 2321 case 12: /* 192-bit key */2322 hcryp.Init.KeySize = CRYP_KEYSIZE_192B;2323 break;2324 case 14: /* 256-bit key */2325 hcryp.Init.KeySize = CRYP_KEYSIZE_256B;2326 break;2327 default:2328 break;2329 }2330 hcryp.Instance = CRYP;2331 hcryp.Init.DataType = CRYP_DATATYPE_8B;2332 hcryp.Init.pKey = (uint8_t*)aes->key;2333 hcryp.Init.pInitVect = (uint8_t*)aes->reg;2334 2335 HAL_CRYP_Init(&hcryp);2336 2337 while (blocks--) {2338 if (HAL_CRYP_AESCBC_Decrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE,2339 out, STM32_HAL_TIMEOUT) != HAL_OK) {2340 ret = WC_TIMEOUT_E;2341 2392 } 2342 2393 … … 2353 2404 } 2354 2405 #endif /* HAVE_AES_DECRYPT */ 2355 #else 2406 2407 #else /* STD_PERI_LIB */ 2356 2408 int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) 2357 2409 { 2358 word32 *enc_key, *iv; 2410 int ret; 2411 word32 *iv; 2359 2412 word32 blocks = (sz / AES_BLOCK_SIZE); 2360 CRYP_InitTypeDef AES_CRYP_InitStructure; 2361 CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure; 2362 CRYP_IVInitTypeDef AES_CRYP_IVInitStructure; 2363 2364 enc_key = aes->key; 2365 iv = aes->reg; 2366 2367 /* crypto structure initialization */ 2368 CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure); 2369 CRYP_StructInit(&AES_CRYP_InitStructure); 2370 CRYP_IVStructInit(&AES_CRYP_IVInitStructure); 2413 CRYP_InitTypeDef cryptInit; 2414 CRYP_KeyInitTypeDef keyInit; 2415 CRYP_IVInitTypeDef ivInit; 2416 2417 ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit); 2418 if (ret != 0) 2419 return ret; 2371 2420 2372 2421 /* reset registers to their default values */ 2373 2422 
CRYP_DeInit(); 2374 2423 2375 /* load key into correct registers */ 2376 switch (aes->rounds) { 2377 case 10: /* 128-bit key */ 2378 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b; 2379 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[0]; 2380 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1]; 2381 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[2]; 2382 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3]; 2383 break; 2384 2385 case 12: /* 192-bit key */ 2386 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b; 2387 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[0]; 2388 AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1]; 2389 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[2]; 2390 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3]; 2391 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[4]; 2392 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5]; 2393 break; 2394 2395 case 14: /* 256-bit key */ 2396 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b; 2397 AES_CRYP_KeyInitStructure.CRYP_Key0Left = enc_key[0]; 2398 AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1]; 2399 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[2]; 2400 AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[3]; 2401 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[4]; 2402 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5]; 2403 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[6]; 2404 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7]; 2405 break; 2406 2407 default: 2408 break; 2409 } 2410 CRYP_KeyInit(&AES_CRYP_KeyInitStructure); 2424 /* set key */ 2425 CRYP_KeyInit(&keyInit); 2411 2426 2412 2427 /* set iv */ 2428 iv = aes->reg; 2429 CRYP_IVStructInit(&ivInit); 2413 2430 ByteReverseWords(iv, iv, AES_BLOCK_SIZE); 2414 AES_CRYP_IVInitStructure.CRYP_IV0Left = iv[0]; 2415 AES_CRYP_IVInitStructure.CRYP_IV0Right = iv[1]; 2416 AES_CRYP_IVInitStructure.CRYP_IV1Left = iv[2]; 2417 AES_CRYP_IVInitStructure.CRYP_IV1Right = iv[3]; 2418 
CRYP_IVInit(&AES_CRYP_IVInitStructure); 2419 2420 /* set direction, mode, and datatype */ 2421 AES_CRYP_InitStructure.CRYP_AlgoDir = CRYP_AlgoDir_Encrypt; 2422 AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC; 2423 AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b; 2424 CRYP_Init(&AES_CRYP_InitStructure); 2431 ivInit.CRYP_IV0Left = iv[0]; 2432 ivInit.CRYP_IV0Right = iv[1]; 2433 ivInit.CRYP_IV1Left = iv[2]; 2434 ivInit.CRYP_IV1Right = iv[3]; 2435 CRYP_IVInit(&ivInit); 2436 2437 /* set direction and mode */ 2438 cryptInit.CRYP_AlgoDir = CRYP_AlgoDir_Encrypt; 2439 cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC; 2440 CRYP_Init(&cryptInit); 2425 2441 2426 2442 /* enable crypto processor */ … … 2455 2471 CRYP_Cmd(DISABLE); 2456 2472 2457 return 0;2473 return ret; 2458 2474 } 2459 2475 … … 2461 2477 int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) 2462 2478 { 2463 word32 *dec_key, *iv; 2479 int ret; 2480 word32 *iv; 2464 2481 word32 blocks = (sz / AES_BLOCK_SIZE); 2465 CRYP_InitTypeDef AES_CRYP_InitStructure; 2466 CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure; 2467 CRYP_IVInitTypeDef AES_CRYP_IVInitStructure; 2468 2469 dec_key = aes->key; 2470 iv = aes->reg; 2471 2472 /* crypto structure initialization */ 2473 CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure); 2474 CRYP_StructInit(&AES_CRYP_InitStructure); 2475 CRYP_IVStructInit(&AES_CRYP_IVInitStructure); 2482 CRYP_InitTypeDef cryptInit; 2483 CRYP_KeyInitTypeDef keyInit; 2484 CRYP_IVInitTypeDef ivInit; 2485 2486 ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit); 2487 if (ret != 0) 2488 return ret; 2476 2489 2477 2490 /* if input and output same will overwrite input iv */ … … 2481 2494 CRYP_DeInit(); 2482 2495 2483 /* load key into correct registers */ 2484 switch (aes->rounds) { 2485 case 10: /* 128-bit key */ 2486 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b; 2487 AES_CRYP_KeyInitStructure.CRYP_Key2Left = dec_key[0]; 2488 AES_CRYP_KeyInitStructure.CRYP_Key2Right = 
dec_key[1]; 2489 AES_CRYP_KeyInitStructure.CRYP_Key3Left = dec_key[2]; 2490 AES_CRYP_KeyInitStructure.CRYP_Key3Right = dec_key[3]; 2491 break; 2492 2493 case 12: /* 192-bit key */ 2494 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b; 2495 AES_CRYP_KeyInitStructure.CRYP_Key1Left = dec_key[0]; 2496 AES_CRYP_KeyInitStructure.CRYP_Key1Right = dec_key[1]; 2497 AES_CRYP_KeyInitStructure.CRYP_Key2Left = dec_key[2]; 2498 AES_CRYP_KeyInitStructure.CRYP_Key2Right = dec_key[3]; 2499 AES_CRYP_KeyInitStructure.CRYP_Key3Left = dec_key[4]; 2500 AES_CRYP_KeyInitStructure.CRYP_Key3Right = dec_key[5]; 2501 break; 2502 2503 case 14: /* 256-bit key */ 2504 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b; 2505 AES_CRYP_KeyInitStructure.CRYP_Key0Left = dec_key[0]; 2506 AES_CRYP_KeyInitStructure.CRYP_Key0Right = dec_key[1]; 2507 AES_CRYP_KeyInitStructure.CRYP_Key1Left = dec_key[2]; 2508 AES_CRYP_KeyInitStructure.CRYP_Key1Right = dec_key[3]; 2509 AES_CRYP_KeyInitStructure.CRYP_Key2Left = dec_key[4]; 2510 AES_CRYP_KeyInitStructure.CRYP_Key2Right = dec_key[5]; 2511 AES_CRYP_KeyInitStructure.CRYP_Key3Left = dec_key[6]; 2512 AES_CRYP_KeyInitStructure.CRYP_Key3Right = dec_key[7]; 2513 break; 2514 2515 default: 2516 break; 2517 } 2518 2519 /* set direction, mode, and datatype for key preparation */ 2520 AES_CRYP_InitStructure.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 2521 AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key; 2522 AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_32b; 2523 CRYP_Init(&AES_CRYP_InitStructure); 2524 CRYP_KeyInit(&AES_CRYP_KeyInitStructure); 2496 /* set direction and key */ 2497 CRYP_KeyInit(&keyInit); 2498 cryptInit.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 2499 cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key; 2500 CRYP_Init(&cryptInit); 2525 2501 2526 2502 /* enable crypto processor */ … … 2530 2506 while (CRYP_GetFlagStatus(CRYP_FLAG_BUSY) != RESET) {} 2531 2507 2532 /* set direction, mode, and datatype for decryption */ 2533 
AES_CRYP_InitStructure.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 2534 AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC; 2535 AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b; 2536 CRYP_Init(&AES_CRYP_InitStructure); 2508 /* set direction and mode */ 2509 cryptInit.CRYP_AlgoDir = CRYP_AlgoDir_Decrypt; 2510 cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC; 2511 CRYP_Init(&cryptInit); 2537 2512 2538 2513 /* set iv */ 2514 iv = aes->reg; 2515 CRYP_IVStructInit(&ivInit); 2539 2516 ByteReverseWords(iv, iv, AES_BLOCK_SIZE); 2540 2541 AES_CRYP_IVInitStructure.CRYP_IV0Left = iv[0]; 2542 AES_CRYP_IVInitStructure.CRYP_IV0Right = iv[1]; 2543 AES_CRYP_IVInitStructure.CRYP_IV1Left = iv[2]; 2544 AES_CRYP_IVInitStructure.CRYP_IV1Right = iv[3]; 2545 CRYP_IVInit(&AES_CRYP_IVInitStructure); 2517 ivInit.CRYP_IV0Left = iv[0]; 2518 ivInit.CRYP_IV0Right = iv[1]; 2519 ivInit.CRYP_IV1Left = iv[2]; 2520 ivInit.CRYP_IV1Right = iv[3]; 2521 CRYP_IVInit(&ivInit); 2546 2522 2547 2523 /* enable crypto processor */ … … 2575 2551 CRYP_Cmd(DISABLE); 2576 2552 2577 return 0;2553 return ret; 2578 2554 } 2579 2555 #endif /* HAVE_AES_DECRYPT */ … … 2704 2680 status = LTC_AES_EncryptCbc(LTC_BASE, in, out, blocks * AES_BLOCK_SIZE, 2705 2681 iv, enc_key, keySize); 2682 2683 /* store iv for next call */ 2684 if (status == kStatus_Success) { 2685 XMEMCPY(iv, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); 2686 } 2687 2706 2688 return (status == kStatus_Success) ? 
0 : -1; 2707 2689 } … … 2714 2696 byte* iv, *dec_key; 2715 2697 word32 blocks = (sz / AES_BLOCK_SIZE); 2698 byte temp_block[AES_BLOCK_SIZE]; 2716 2699 2717 2700 iv = (byte*)aes->reg; … … 2723 2706 } 2724 2707 2708 /* get IV for next call */ 2709 XMEMCPY(temp_block, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); 2710 2725 2711 status = LTC_AES_DecryptCbc(LTC_BASE, in, out, blocks * AES_BLOCK_SIZE, 2726 2712 iv, dec_key, keySize, kLTC_EncryptKey); 2713 2714 /* store IV for next call */ 2715 if (status == kStatus_Success) { 2716 XMEMCPY(iv, temp_block, AES_BLOCK_SIZE); 2717 } 2718 2727 2719 return (status == kStatus_Success) ? 0 : -1; 2728 2720 } … … 2791 2783 int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) 2792 2784 { 2793 return wc_Pic32AesCrypt( 2785 int ret; 2786 2787 /* hardware fails on input that is not a multiple of AES block size */ 2788 if (sz % AES_BLOCK_SIZE != 0) { 2789 return BAD_FUNC_ARG; 2790 } 2791 2792 ret = wc_Pic32AesCrypt( 2794 2793 aes->key, aes->keylen, aes->reg, AES_BLOCK_SIZE, 2795 2794 out, in, sz, PIC32_ENCRYPTION, 2796 2795 PIC32_ALGO_AES, PIC32_CRYPTOALGO_RCBC); 2796 2797 /* store iv for next call */ 2798 if (ret == 0) { 2799 XMEMCPY(aes->reg, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); 2800 } 2801 2802 return ret; 2797 2803 } 2798 2804 #ifdef HAVE_AES_DECRYPT 2799 2805 int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) 2800 2806 { 2801 return wc_Pic32AesCrypt( 2807 int ret; 2808 byte scratch[AES_BLOCK_SIZE]; 2809 2810 /* hardware fails on input that is not a multiple of AES block size */ 2811 if (sz % AES_BLOCK_SIZE != 0) { 2812 return BAD_FUNC_ARG; 2813 } 2814 XMEMCPY(scratch, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); 2815 2816 ret = wc_Pic32AesCrypt( 2802 2817 aes->key, aes->keylen, aes->reg, AES_BLOCK_SIZE, 2803 2818 out, in, sz, PIC32_DECRYPTION, 2804 2819 PIC32_ALGO_AES, PIC32_CRYPTOALGO_RCBC); 2820 2821 /* store iv for next call */ 2822 if (ret == 0) { 2823 XMEMCPY((byte*)aes->reg, scratch, 
AES_BLOCK_SIZE); 2824 } 2825 2826 return ret; 2805 2827 } 2806 2828 #endif /* HAVE_AES_DECRYPT */ 2807 2829 2830 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) 2831 /* implemented in wolfcrypt/src/port/caam/caam_aes.c */ 2832 2833 #elif defined(WOLFSSL_AFALG) 2834 /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */ 2835 2836 #elif defined(WOLFSSL_DEVCRYPTO_CBC) 2837 /* implemented in wolfcrypt/src/port/devcrypt/devcrypto_aes.c */ 2838 2808 2839 #else 2809 2840 2841 /* Software AES - CBC Encrypt */ 2810 2842 int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) 2811 2843 { … … 2897 2929 2898 2930 #ifdef HAVE_AES_DECRYPT 2931 /* Software AES - CBC Decrypt */ 2899 2932 int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) 2900 2933 { … … 2977 3010 #endif /* HAVE_AES_CBC */ 2978 3011 2979 #ifdef HAVE_AES_ECB2980 int wc_AesEcbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)2981 {2982 if ((in == NULL) || (out == NULL) || (aes == NULL))2983 return BAD_FUNC_ARG;2984 while (sz>0) {2985 wc_AesEncryptDirect(aes, out, in);2986 out += AES_BLOCK_SIZE;2987 in += AES_BLOCK_SIZE;2988 sz -= AES_BLOCK_SIZE;2989 }2990 return 0;2991 }2992 int wc_AesEcbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)2993 {2994 if ((in == NULL) || (out == NULL) || (aes == NULL))2995 return BAD_FUNC_ARG;2996 while (sz>0) {2997 wc_AesDecryptDirect(aes, out, in);2998 out += AES_BLOCK_SIZE;2999 in += AES_BLOCK_SIZE;3000 sz -= AES_BLOCK_SIZE;3001 }3002 return 0;3003 }3004 #endif3005 3006 3012 /* AES-CTR */ 3007 3013 #if defined(WOLFSSL_AES_COUNTER) … … 3016 3022 #ifdef WOLFSSL_STM32_CUBEMX 3017 3023 CRYP_HandleTypeDef hcryp; 3018 3019 XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); 3020 switch (aes->rounds) { 3021 case 10: /* 128-bit key */ 3022 hcryp.Init.KeySize = CRYP_KEYSIZE_128B; 3023 break; 3024 case 12: /* 192-bit key */ 3025 hcryp.Init.KeySize = CRYP_KEYSIZE_192B; 3026 break; 3027 case 14: /* 256-bit key */ 3028 hcryp.Init.KeySize = 
CRYP_KEYSIZE_256B; 3029 break; 3030 default: 3031 break; 3032 } 3033 hcryp.Instance = CRYP; 3034 hcryp.Init.DataType = CRYP_DATATYPE_8B; 3035 hcryp.Init.pKey = (byte*)aes->key; 3036 hcryp.Init.pInitVect = (byte*)aes->reg; 3037 3024 #else 3025 word32 *iv; 3026 CRYP_InitTypeDef cryptInit; 3027 CRYP_KeyInitTypeDef keyInit; 3028 CRYP_IVInitTypeDef ivInit; 3029 #endif 3030 3031 #ifdef WOLFSSL_STM32_CUBEMX 3032 ret = wc_Stm32_Aes_Init(aes, &hcryp); 3033 if (ret != 0) 3034 return ret; 3035 3036 #ifdef STM32_CRYPTO_AES_ONLY 3037 hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT; 3038 hcryp.Init.ChainingMode = CRYP_CHAINMODE_AES_CTR; 3039 hcryp.Init.KeyWriteFlag = CRYP_KEY_WRITE_ENABLE; 3040 #endif 3041 hcryp.Init.pInitVect = (uint8_t*)aes->reg; 3038 3042 HAL_CRYP_Init(&hcryp); 3039 3043 3040 if (HAL_CRYP_AESCTR_Encrypt(&hcryp, (byte*)in, AES_BLOCK_SIZE, out, 3041 STM32_HAL_TIMEOUT) != HAL_OK) { 3042 /* failed */ 3044 #ifdef STM32_CRYPTO_AES_ONLY 3045 ret = HAL_CRYPEx_AES(&hcryp, (byte*)in, AES_BLOCK_SIZE, 3046 out, STM32_HAL_TIMEOUT); 3047 #else 3048 ret = HAL_CRYP_AESCTR_Encrypt(&hcryp, (byte*)in, AES_BLOCK_SIZE, 3049 out, STM32_HAL_TIMEOUT); 3050 #endif 3051 if (ret != HAL_OK) { 3043 3052 ret = WC_TIMEOUT_E; 3044 3053 } 3045 3046 3054 HAL_CRYP_DeInit(&hcryp); 3047 3055 3048 3056 #else /* STD_PERI_LIB */ 3049 word32 *enc_key, *iv; 3050 CRYP_InitTypeDef AES_CRYP_InitStructure; 3051 CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure; 3052 CRYP_IVInitTypeDef AES_CRYP_IVInitStructure; 3053 3054 enc_key = aes->key; 3055 iv = aes->reg; 3056 3057 /* crypto structure initialization */ 3058 CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure); 3059 CRYP_StructInit(&AES_CRYP_InitStructure); 3060 CRYP_IVStructInit(&AES_CRYP_IVInitStructure); 3057 ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit); 3058 if (ret != 0) 3059 return ret; 3061 3060 3062 3061 /* reset registers to their default values */ 3063 3062 CRYP_DeInit(); 3064 3063 3065 /* load key into correct registers */ 3066 switch 
(aes->rounds) { 3067 case 10: /* 128-bit key */ 3068 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b; 3069 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[0]; 3070 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1]; 3071 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[2]; 3072 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3]; 3073 break; 3074 case 12: /* 192-bit key */ 3075 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b; 3076 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[0]; 3077 AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1]; 3078 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[2]; 3079 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3]; 3080 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[4]; 3081 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5]; 3082 break; 3083 case 14: /* 256-bit key */ 3084 AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b; 3085 AES_CRYP_KeyInitStructure.CRYP_Key0Left = enc_key[0]; 3086 AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1]; 3087 AES_CRYP_KeyInitStructure.CRYP_Key1Left = enc_key[2]; 3088 AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[3]; 3089 AES_CRYP_KeyInitStructure.CRYP_Key2Left = enc_key[4]; 3090 AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5]; 3091 AES_CRYP_KeyInitStructure.CRYP_Key3Left = enc_key[6]; 3092 AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7]; 3093 break; 3094 default: 3095 break; 3096 } 3097 CRYP_KeyInit(&AES_CRYP_KeyInitStructure); 3064 /* set key */ 3065 CRYP_KeyInit(&keyInit); 3098 3066 3099 3067 /* set iv */ 3100 AES_CRYP_IVInitStructure.CRYP_IV0Left = ByteReverseWord32(iv[0]); 3101 AES_CRYP_IVInitStructure.CRYP_IV0Right = ByteReverseWord32(iv[1]); 3102 AES_CRYP_IVInitStructure.CRYP_IV1Left = ByteReverseWord32(iv[2]); 3103 AES_CRYP_IVInitStructure.CRYP_IV1Right = ByteReverseWord32(iv[3]); 3104 CRYP_IVInit(&AES_CRYP_IVInitStructure); 3105 3106 /* set direction, mode, and datatype */ 3107 AES_CRYP_InitStructure.CRYP_AlgoDir = 
CRYP_AlgoDir_Encrypt; 3108 AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_CTR; 3109 AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b; 3110 CRYP_Init(&AES_CRYP_InitStructure); 3068 iv = aes->reg; 3069 CRYP_IVStructInit(&ivInit); 3070 ivInit.CRYP_IV0Left = ByteReverseWord32(iv[0]); 3071 ivInit.CRYP_IV0Right = ByteReverseWord32(iv[1]); 3072 ivInit.CRYP_IV1Left = ByteReverseWord32(iv[2]); 3073 ivInit.CRYP_IV1Right = ByteReverseWord32(iv[3]); 3074 CRYP_IVInit(&ivInit); 3075 3076 /* set direction and mode */ 3077 cryptInit.CRYP_AlgoDir = CRYP_AlgoDir_Encrypt; 3078 cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_CTR; 3079 CRYP_Init(&cryptInit); 3111 3080 3112 3081 /* enable crypto processor */ … … 3188 3157 } 3189 3158 3159 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) 3160 /* implemented in wolfcrypt/src/port/caam/caam_aes.c */ 3161 3162 #elif defined(WOLFSSL_AFALG) 3163 /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */ 3164 3165 #elif defined(WOLFSSL_DEVCRYPTO_AES) 3166 /* implemented in wolfcrypt/src/port/devcrypt/devcrypto_aes.c */ 3167 3190 3168 #else 3191 3169 … … 3196 3174 #ifdef NEED_AES_CTR_SOFT 3197 3175 /* Increment AES counter */ 3198 static INLINE void IncrementAesCounter(byte* inOutCtr)3176 static WC_INLINE void IncrementAesCounter(byte* inOutCtr) 3199 3177 { 3200 3178 /* in network byte order so start at end and work back */ … … 3206 3184 } 3207 3185 3186 /* Software AES - CTR Encrypt */ 3208 3187 int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) 3209 3188 { … … 3258 3237 3259 3238 #endif /* WOLFSSL_AES_COUNTER */ 3239 #endif /* !WOLFSSL_ARMASM */ 3240 3241 3242 /* 3243 * The IV for AES GCM and CCM, stored in struct Aes's member reg, is comprised 3244 * of two parts in order: 3245 * 1. The fixed field which may be 0 or 4 bytes long. In TLS, this is set 3246 * to the implicit IV. 3247 * 2. The explicit IV is generated by wolfCrypt. 
It needs to be managed 3248 * by wolfCrypt to ensure the IV is unique for each call to encrypt. 3249 * The IV may be a 96-bit random value, or the 32-bit fixed value and a 3250 * 64-bit set of 0 or random data. The final 32-bits of reg is used as a 3251 * block counter during the encryption. 3252 */ 3253 3254 #if (defined(HAVE_AESGCM) && !defined(WC_NO_RNG)) || defined(HAVE_AESCCM) 3255 static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) 3256 { 3257 int i; 3258 for (i = ctrSz-1; i >= 0; i--) { 3259 if (++ctr[i]) 3260 break; 3261 } 3262 } 3263 #endif /* HAVE_AESGCM || HAVE_AESCCM */ 3260 3264 3261 3265 3262 3266 #ifdef HAVE_AESGCM 3263 3264 /*3265 * The IV for AES GCM, stored in struct Aes's member reg, is comprised of3266 * three parts in order:3267 * 1. The implicit IV. This is generated from the PRF using the shared3268 * secrets between endpoints. It is 4 bytes long.3269 * 2. The explicit IV. This is set by the user of the AES. It needs to be3270 * unique for each call to encrypt. The explicit IV is shared with the3271 * other end of the transaction in the clear.3272 * 3. The counter. 
Each block of data is encrypted with its own sequence3273 * number counter.3274 */3275 3267 3276 3268 #if defined(HAVE_COLDFIRE_SEC) … … 3282 3274 #endif 3283 3275 3284 enum { 3285 NONCE_SZ = 12, 3286 CTR_SZ = 4 3287 }; 3276 #ifdef WOLFSSL_ARMASM 3277 /* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */ 3278 3279 #elif defined(WOLFSSL_AFALG) 3280 /* implemented in wolfcrypt/src/port/afalg/afalg_aes.c */ 3281 3282 #elif defined(WOLFSSL_DEVCRYPTO_AES) 3283 /* implemented in wolfcrypt/src/port/devcrypt/devcrypto_aes.c */ 3284 3285 #else /* software + AESNI implementation */ 3288 3286 3289 3287 #if !defined(FREESCALE_LTC_AES_GCM) 3290 static INLINE void IncrementGcmCounter(byte* inOutCtr)3288 static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) 3291 3289 { 3292 3290 int i; … … 3302 3300 #if defined(GCM_SMALL) || defined(GCM_TABLE) 3303 3301 3304 static INLINE void FlattenSzInBits(byte* buf, word32 sz)3302 static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) 3305 3303 { 3306 3304 /* Multiply the sz by 8 */ … … 3320 3318 3321 3319 3322 static INLINE void RIGHTSHIFTX(byte* x)3320 static WC_INLINE void RIGHTSHIFTX(byte* x) 3323 3321 { 3324 3322 int i; … … 3364 3362 #endif /* GCM_TABLE */ 3365 3363 3366 3364 /* Software AES - GCM SetKey */ 3367 3365 int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) 3368 3366 { 3369 3367 int ret; 3370 3368 byte iv[AES_BLOCK_SIZE]; 3369 3370 #ifdef WOLFSSL_IMX6_CAAM_BLOB 3371 byte local[32]; 3372 word32 localSz = 32; 3373 3374 if (len == (16 + WC_CAAM_BLOB_SZ) || 3375 len == (24 + WC_CAAM_BLOB_SZ) || 3376 len == (32 + WC_CAAM_BLOB_SZ)) { 3377 if (wc_caamOpenBlob((byte*)key, len, local, &localSz) != 0) { 3378 return BAD_FUNC_ARG; 3379 } 3380 3381 /* set local values */ 3382 key = local; 3383 len = localSz; 3384 } 3385 #endif 3371 3386 3372 3387 if (!((len == 16) || (len == 24) || (len == 32))) … … 3393 3408 #if defined(WOLFSSL_XILINX_CRYPT) 3394 3409 wc_AesGcmSetKey_ex(aes, key, len, 
XSECURE_CSU_AES_KEY_SRC_KUP); 3410 #endif 3411 3412 #ifdef WOLFSSL_IMX6_CAAM_BLOB 3413 ForceZero(local, sizeof(local)); 3395 3414 #endif 3396 3415 … … 3416 3435 #endif 3417 3436 3418 static const __m128i MOD2_128 = M128_INIT(0x1, 0xc200000000000000UL); 3437 static const __m128i MOD2_128 = M128_INIT(0x1, 3438 (long long int)0xc200000000000000UL); 3439 3440 3441 /* See Intel® Carry-Less Multiplication Instruction 3442 * and its Usage for Computing the GCM Mode White Paper 3443 * by Shay Gueron, Intel Mobility Group, Israel Development Center; 3444 * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research */ 3445 3446 3447 /* Figure 9. AES-GCM – Encrypt With Single Block Ghash at a Time */ 3448 3449 static const __m128i ONE = M128_INIT(0x0, 0x1); 3450 #ifndef AES_GCM_AESNI_NO_UNROLL 3451 static const __m128i TWO = M128_INIT(0x0, 0x2); 3452 static const __m128i THREE = M128_INIT(0x0, 0x3); 3453 static const __m128i FOUR = M128_INIT(0x0, 0x4); 3454 static const __m128i FIVE = M128_INIT(0x0, 0x5); 3455 static const __m128i SIX = M128_INIT(0x0, 0x6); 3456 static const __m128i SEVEN = M128_INIT(0x0, 0x7); 3457 static const __m128i EIGHT = M128_INIT(0x0, 0x8); 3458 #endif 3459 static const __m128i BSWAP_EPI64 = M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f); 3460 static const __m128i BSWAP_MASK = M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607); 3461 3462 3463 #ifndef _MSC_VER 3464 3465 #define _VAR(a) "" #a "" 3466 #define VAR(a) _VAR(a) 3467 3468 #define HR %%xmm14 3469 #define XR %%xmm15 3470 #define KR %%ebx 3471 #define KR64 %%rbx 3472 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) 3473 #define CTR1 128(%%rsp) 3474 #define TR 144(%%rsp) 3475 #define HTR %%rsp 3476 #define STACK_OFFSET 160 3477 #else 3478 #define CTR1 (%%rsp) 3479 #define TR 16(%%rsp) 3480 #define STACK_OFFSET 32 3481 #endif 3482 3483 #define AESENC() \ 3484 "aesenc %%xmm12, %%xmm4\n\t" \ 3485 "aesenc %%xmm12, %%xmm5\n\t" \ 3486 "aesenc %%xmm12, %%xmm6\n\t" \ 
3487 "aesenc %%xmm12, %%xmm7\n\t" \ 3488 "aesenc %%xmm12, %%xmm8\n\t" \ 3489 "aesenc %%xmm12, %%xmm9\n\t" \ 3490 "aesenc %%xmm12, %%xmm10\n\t" \ 3491 "aesenc %%xmm12, %%xmm11\n\t" 3492 3493 #define AESENC_SET(o) \ 3494 "movdqa " #o "(%[KEY]), %%xmm12\n\t" \ 3495 AESENC() 3496 3497 #define AESENC_CTR() \ 3498 "movdqu " VAR(CTR1) ", %%xmm4\n\t" \ 3499 "movdqa %[BSWAP_EPI64], %%xmm1\n\t" \ 3500 "movdqu %%xmm4, %%xmm0\n\t" \ 3501 "pshufb %%xmm1, %%xmm4\n\t" \ 3502 "movdqa %%xmm0, %%xmm5\n\t" \ 3503 "paddd %[ONE], %%xmm5\n\t" \ 3504 "pshufb %%xmm1, %%xmm5\n\t" \ 3505 "movdqa %%xmm0, %%xmm6\n\t" \ 3506 "paddd %[TWO], %%xmm6\n\t" \ 3507 "pshufb %%xmm1, %%xmm6\n\t" \ 3508 "movdqa %%xmm0, %%xmm7\n\t" \ 3509 "paddd %[THREE], %%xmm7\n\t" \ 3510 "pshufb %%xmm1, %%xmm7\n\t" \ 3511 "movdqa %%xmm0, %%xmm8\n\t" \ 3512 "paddd %[FOUR], %%xmm8\n\t" \ 3513 "pshufb %%xmm1, %%xmm8\n\t" \ 3514 "movdqa %%xmm0, %%xmm9\n\t" \ 3515 "paddd %[FIVE], %%xmm9\n\t" \ 3516 "pshufb %%xmm1, %%xmm9\n\t" \ 3517 "movdqa %%xmm0, %%xmm10\n\t" \ 3518 "paddd %[SIX], %%xmm10\n\t" \ 3519 "pshufb %%xmm1, %%xmm10\n\t" \ 3520 "movdqa %%xmm0, %%xmm11\n\t" \ 3521 "paddd %[SEVEN], %%xmm11\n\t" \ 3522 "pshufb %%xmm1, %%xmm11\n\t" \ 3523 "paddd %[EIGHT], %%xmm0\n\t" 3524 3525 #define AESENC_XOR() \ 3526 "movdqa (%[KEY]), %%xmm12\n\t" \ 3527 "movdqu %%xmm0, " VAR(CTR1) "\n\t" \ 3528 "pxor %%xmm12, %%xmm4\n\t" \ 3529 "pxor %%xmm12, %%xmm5\n\t" \ 3530 "pxor %%xmm12, %%xmm6\n\t" \ 3531 "pxor %%xmm12, %%xmm7\n\t" \ 3532 "pxor %%xmm12, %%xmm8\n\t" \ 3533 "pxor %%xmm12, %%xmm9\n\t" \ 3534 "pxor %%xmm12, %%xmm10\n\t" \ 3535 "pxor %%xmm12, %%xmm11\n\t" 3536 3537 /* Encrypt and carry-less multiply for AVX1. 
*/ 3538 #define AESENC_PCLMUL_1(src, o1, o2, o3) \ 3539 "movdqu " #o3 "(" VAR(HTR) "), %%xmm12\n\t" \ 3540 "movdqu " #o2 "(" #src "), %%xmm0\n\t" \ 3541 "aesenc " #o1 "(%[KEY]), %%xmm4\n\t" \ 3542 "pshufb %[BSWAP_MASK], %%xmm0\n\t" \ 3543 "pxor %%xmm2, %%xmm0\n\t" \ 3544 "pshufd $0x4e, %%xmm12, %%xmm1\n\t" \ 3545 "pshufd $0x4e, %%xmm0, %%xmm14\n\t" \ 3546 "pxor %%xmm12, %%xmm1\n\t" \ 3547 "pxor %%xmm0, %%xmm14\n\t" \ 3548 "movdqa %%xmm0, %%xmm3\n\t" \ 3549 "pclmulqdq $0x11, %%xmm12, %%xmm3\n\t" \ 3550 "aesenc " #o1 "(%[KEY]), %%xmm5\n\t" \ 3551 "aesenc " #o1 "(%[KEY]), %%xmm6\n\t" \ 3552 "movdqa %%xmm0, %%xmm2\n\t" \ 3553 "pclmulqdq $0x00, %%xmm12, %%xmm2\n\t" \ 3554 "aesenc " #o1 "(%[KEY]), %%xmm7\n\t" \ 3555 "aesenc " #o1 "(%[KEY]), %%xmm8\n\t" \ 3556 "pclmulqdq $0x00, %%xmm14, %%xmm1\n\t" \ 3557 "aesenc " #o1 "(%[KEY]), %%xmm9\n\t" \ 3558 "aesenc " #o1 "(%[KEY]), %%xmm10\n\t" \ 3559 "aesenc " #o1 "(%[KEY]), %%xmm11\n\t" \ 3560 "pxor %%xmm2, %%xmm1\n\t" \ 3561 "pxor %%xmm3, %%xmm1\n\t" \ 3562 3563 #define AESENC_PCLMUL_N(src, o1, o2, o3) \ 3564 "movdqu " #o3 "(" VAR(HTR) "), %%xmm12\n\t" \ 3565 "movdqu " #o2 "(" #src" ), %%xmm0\n\t" \ 3566 "pshufd $0x4e, %%xmm12, %%xmm13\n\t" \ 3567 "pshufb %[BSWAP_MASK], %%xmm0\n\t" \ 3568 "aesenc " #o1 "(%[KEY]), %%xmm4\n\t" \ 3569 "pxor %%xmm12, %%xmm13\n\t" \ 3570 "pshufd $0x4e, %%xmm0, %%xmm14\n\t" \ 3571 "pxor %%xmm0, %%xmm14\n\t" \ 3572 "movdqa %%xmm0, %%xmm15\n\t" \ 3573 "pclmulqdq $0x11, %%xmm12, %%xmm15\n\t" \ 3574 "aesenc " #o1 "(%[KEY]), %%xmm5\n\t" \ 3575 "aesenc " #o1 "(%[KEY]), %%xmm6\n\t" \ 3576 "pclmulqdq $0x00, %%xmm0, %%xmm12\n\t" \ 3577 "aesenc " #o1 "(%[KEY]), %%xmm7\n\t" \ 3578 "aesenc " #o1 "(%[KEY]), %%xmm8\n\t" \ 3579 "pclmulqdq $0x00, %%xmm14, %%xmm13\n\t" \ 3580 "aesenc " #o1 "(%[KEY]), %%xmm9\n\t" \ 3581 "aesenc " #o1 "(%[KEY]), %%xmm10\n\t" \ 3582 "aesenc " #o1 "(%[KEY]), %%xmm11\n\t" \ 3583 "pxor %%xmm12, %%xmm1\n\t" \ 3584 "pxor %%xmm12, %%xmm2\n\t" \ 3585 "pxor %%xmm15, %%xmm1\n\t" \ 3586 "pxor 
%%xmm15, %%xmm3\n\t" \ 3587 "pxor %%xmm13, %%xmm1\n\t" \ 3588 3589 #define AESENC_PCLMUL_L(o) \ 3590 "movdqa %%xmm1, %%xmm14\n\t" \ 3591 "psrldq $8, %%xmm1\n\t" \ 3592 "pslldq $8, %%xmm14\n\t" \ 3593 "aesenc " #o "(%[KEY]), %%xmm4\n\t" \ 3594 "pxor %%xmm14, %%xmm2\n\t" \ 3595 "pxor %%xmm1, %%xmm3\n\t" \ 3596 "movdqa %%xmm2, %%xmm12\n\t" \ 3597 "movdqa %%xmm2, %%xmm13\n\t" \ 3598 "movdqa %%xmm2, %%xmm14\n\t" \ 3599 "aesenc " #o "(%[KEY]), %%xmm5\n\t" \ 3600 "pslld $31, %%xmm12\n\t" \ 3601 "pslld $30, %%xmm13\n\t" \ 3602 "pslld $25, %%xmm14\n\t" \ 3603 "aesenc " #o "(%[KEY]), %%xmm6\n\t" \ 3604 "pxor %%xmm13, %%xmm12\n\t" \ 3605 "pxor %%xmm14, %%xmm12\n\t" \ 3606 "aesenc " #o "(%[KEY]), %%xmm7\n\t" \ 3607 "movdqa %%xmm12, %%xmm13\n\t" \ 3608 "pslldq $12, %%xmm12\n\t" \ 3609 "psrldq $4, %%xmm13\n\t" \ 3610 "aesenc " #o "(%[KEY]), %%xmm8\n\t" \ 3611 "pxor %%xmm12, %%xmm2\n\t" \ 3612 "movdqa %%xmm2, %%xmm14\n\t" \ 3613 "movdqa %%xmm2, %%xmm1\n\t" \ 3614 "movdqa %%xmm2, %%xmm0\n\t" \ 3615 "aesenc " #o "(%[KEY]), %%xmm9\n\t" \ 3616 "psrld $1, %%xmm14\n\t" \ 3617 "psrld $2, %%xmm1\n\t" \ 3618 "psrld $7, %%xmm0\n\t" \ 3619 "aesenc " #o "(%[KEY]), %%xmm10\n\t" \ 3620 "pxor %%xmm1, %%xmm14\n\t" \ 3621 "pxor %%xmm0, %%xmm14\n\t" \ 3622 "aesenc " #o "(%[KEY]), %%xmm11\n\t" \ 3623 "pxor %%xmm13, %%xmm14\n\t" \ 3624 "pxor %%xmm14, %%xmm2\n\t" \ 3625 "pxor %%xmm3, %%xmm2\n\t" \ 3626 3627 /* Encrypt and carry-less multiply with last key. 
*/ 3628 #define AESENC_LAST(in, out) \ 3629 "aesenclast %%xmm12, %%xmm4\n\t" \ 3630 "aesenclast %%xmm12, %%xmm5\n\t" \ 3631 "movdqu (" #in "),%%xmm0\n\t" \ 3632 "movdqu 16(" #in "),%%xmm1\n\t" \ 3633 "pxor %%xmm0, %%xmm4\n\t" \ 3634 "pxor %%xmm1, %%xmm5\n\t" \ 3635 "movdqu %%xmm4, (" #out ")\n\t" \ 3636 "movdqu %%xmm5, 16(" #out ")\n\t" \ 3637 "aesenclast %%xmm12, %%xmm6\n\t" \ 3638 "aesenclast %%xmm12, %%xmm7\n\t" \ 3639 "movdqu 32(" #in "),%%xmm0\n\t" \ 3640 "movdqu 48(" #in "),%%xmm1\n\t" \ 3641 "pxor %%xmm0, %%xmm6\n\t" \ 3642 "pxor %%xmm1, %%xmm7\n\t" \ 3643 "movdqu %%xmm6, 32(" #out ")\n\t" \ 3644 "movdqu %%xmm7, 48(" #out ")\n\t" \ 3645 "aesenclast %%xmm12, %%xmm8\n\t" \ 3646 "aesenclast %%xmm12, %%xmm9\n\t" \ 3647 "movdqu 64(" #in "),%%xmm0\n\t" \ 3648 "movdqu 80(" #in "),%%xmm1\n\t" \ 3649 "pxor %%xmm0, %%xmm8\n\t" \ 3650 "pxor %%xmm1, %%xmm9\n\t" \ 3651 "movdqu %%xmm8, 64(" #out ")\n\t" \ 3652 "movdqu %%xmm9, 80(" #out ")\n\t" \ 3653 "aesenclast %%xmm12, %%xmm10\n\t" \ 3654 "aesenclast %%xmm12, %%xmm11\n\t" \ 3655 "movdqu 96(" #in "),%%xmm0\n\t" \ 3656 "movdqu 112(" #in "),%%xmm1\n\t" \ 3657 "pxor %%xmm0, %%xmm10\n\t" \ 3658 "pxor %%xmm1, %%xmm11\n\t" \ 3659 "movdqu %%xmm10, 96(" #out ")\n\t" \ 3660 "movdqu %%xmm11, 112(" #out ")\n\t" 3661 3662 #define _AESENC_AVX(r) \ 3663 "aesenc 16(%[KEY]), " #r "\n\t" \ 3664 "aesenc 32(%[KEY]), " #r "\n\t" \ 3665 "aesenc 48(%[KEY]), " #r "\n\t" \ 3666 "aesenc 64(%[KEY]), " #r "\n\t" \ 3667 "aesenc 80(%[KEY]), " #r "\n\t" \ 3668 "aesenc 96(%[KEY]), " #r "\n\t" \ 3669 "aesenc 112(%[KEY]), " #r "\n\t" \ 3670 "aesenc 128(%[KEY]), " #r "\n\t" \ 3671 "aesenc 144(%[KEY]), " #r "\n\t" \ 3672 "cmpl $11, %[nr]\n\t" \ 3673 "movdqa 160(%[KEY]), %%xmm5\n\t" \ 3674 "jl %=f\n\t" \ 3675 "aesenc %%xmm5, " #r "\n\t" \ 3676 "aesenc 176(%[KEY]), " #r "\n\t" \ 3677 "cmpl $13, %[nr]\n\t" \ 3678 "movdqa 192(%[KEY]), %%xmm5\n\t" \ 3679 "jl %=f\n\t" \ 3680 "aesenc %%xmm5, " #r "\n\t" \ 3681 "aesenc 208(%[KEY]), " #r "\n\t" \ 3682 "movdqa 
224(%[KEY]), %%xmm5\n\t" \ 3683 "%=:\n\t" \ 3684 "aesenclast %%xmm5, " #r "\n\t" 3685 #define AESENC_AVX(r) \ 3686 _AESENC_AVX(r) 3687 3688 #define AESENC_BLOCK(in, out) \ 3689 "movdqu " VAR(CTR1) ", %%xmm4\n\t" \ 3690 "movdqu %%xmm4, %%xmm5\n\t" \ 3691 "pshufb %[BSWAP_EPI64], %%xmm4\n\t" \ 3692 "paddd %[ONE], %%xmm5\n\t" \ 3693 "pxor (%[KEY]), %%xmm4\n\t" \ 3694 "movdqu %%xmm5, " VAR(CTR1) "\n\t" \ 3695 AESENC_AVX(%%xmm4) \ 3696 "movdqu (" #in "), %%xmm5\n\t" \ 3697 "pxor %%xmm5, %%xmm4\n\t" \ 3698 "movdqu %%xmm4, (" #out ")\n\t" \ 3699 "pshufb %[BSWAP_MASK], %%xmm4\n\t" \ 3700 "pxor %%xmm4, " VAR(XR) "\n\t" 3701 3702 #define _AESENC_GFMUL(in, out, H, X) \ 3703 "movdqu " VAR(CTR1) ", %%xmm4\n\t" \ 3704 "movdqu %%xmm4, %%xmm5\n\t" \ 3705 "pshufb %[BSWAP_EPI64], %%xmm4\n\t" \ 3706 "paddd %[ONE], %%xmm5\n\t" \ 3707 "pxor (%[KEY]), %%xmm4\n\t" \ 3708 "movdqu %%xmm5, " VAR(CTR1) "\n\t" \ 3709 "movdqa " #X ", %%xmm6\n\t" \ 3710 "pclmulqdq $0x10, " #H ", %%xmm6\n\t" \ 3711 "aesenc 16(%[KEY]), %%xmm4\n\t" \ 3712 "aesenc 32(%[KEY]), %%xmm4\n\t" \ 3713 "movdqa " #X ", %%xmm7\n\t" \ 3714 "pclmulqdq $0x01, " #H ", %%xmm7\n\t" \ 3715 "aesenc 48(%[KEY]), %%xmm4\n\t" \ 3716 "aesenc 64(%[KEY]), %%xmm4\n\t" \ 3717 "movdqa " #X ", %%xmm8\n\t" \ 3718 "pclmulqdq $0x00, " #H ", %%xmm8\n\t" \ 3719 "aesenc 80(%[KEY]), %%xmm4\n\t" \ 3720 "movdqa " #X ", %%xmm1\n\t" \ 3721 "pclmulqdq $0x11, " #H ", %%xmm1\n\t" \ 3722 "aesenc 96(%[KEY]), %%xmm4\n\t" \ 3723 "pxor %%xmm7, %%xmm6\n\t" \ 3724 "movdqa %%xmm6, %%xmm2\n\t" \ 3725 "psrldq $8, %%xmm6\n\t" \ 3726 "pslldq $8, %%xmm2\n\t" \ 3727 "aesenc 112(%[KEY]), %%xmm4\n\t" \ 3728 "movdqa %%xmm1, %%xmm3\n\t" \ 3729 "pxor %%xmm8, %%xmm2\n\t" \ 3730 "pxor %%xmm6, %%xmm3\n\t" \ 3731 "movdqa %[MOD2_128], %%xmm0\n\t" \ 3732 "movdqa %%xmm2, %%xmm7\n\t" \ 3733 "pclmulqdq $0x10, %%xmm0, %%xmm7\n\t" \ 3734 "aesenc 128(%[KEY]), %%xmm4\n\t" \ 3735 "pshufd $0x4e, %%xmm2, %%xmm6\n\t" \ 3736 "pxor %%xmm7, %%xmm6\n\t" \ 3737 "movdqa %%xmm6, %%xmm7\n\t" \ 3738 
"pclmulqdq $0x10, %%xmm0, %%xmm7\n\t" \ 3739 "aesenc 144(%[KEY]), %%xmm4\n\t" \ 3740 "pshufd $0x4e, %%xmm6, " VAR(XR) "\n\t" \ 3741 "pxor %%xmm7, " VAR(XR) "\n\t" \ 3742 "pxor %%xmm3, " VAR(XR) "\n\t" \ 3743 "cmpl $11, %[nr]\n\t" \ 3744 "movdqu 160(%[KEY]), %%xmm5\n\t" \ 3745 "jl %=f\n\t" \ 3746 "aesenc %%xmm5, %%xmm4\n\t" \ 3747 "aesenc 176(%[KEY]), %%xmm4\n\t" \ 3748 "cmpl $13, %[nr]\n\t" \ 3749 "movdqu 192(%[KEY]), %%xmm5\n\t" \ 3750 "jl %=f\n\t" \ 3751 "aesenc %%xmm5, %%xmm4\n\t" \ 3752 "aesenc 208(%[KEY]), %%xmm4\n\t" \ 3753 "movdqa 224(%[KEY]), %%xmm5\n\t" \ 3754 "%=:\n\t" \ 3755 "aesenclast %%xmm5, %%xmm4\n\t" \ 3756 "movdqu (" #in "), %%xmm5\n\t" \ 3757 "pxor %%xmm5, %%xmm4\n\t" \ 3758 "movdqu %%xmm4, (" #out ")\n\t" 3759 #define AESENC_GFMUL(in, out, H, X) \ 3760 _AESENC_GFMUL(in, out, H, X) 3761 3762 #define _GHASH_GFMUL_AVX(r, r2, a, b) \ 3763 "pshufd $0x4e, "#a", %%xmm1\n\t" \ 3764 "pshufd $0x4e, "#b", %%xmm2\n\t" \ 3765 "movdqa "#b", %%xmm3\n\t" \ 3766 "movdqa "#b", %%xmm0\n\t" \ 3767 "pclmulqdq $0x11, "#a", %%xmm3\n\t" \ 3768 "pclmulqdq $0x00, "#a", %%xmm0\n\t" \ 3769 "pxor "#a", %%xmm1\n\t" \ 3770 "pxor "#b", %%xmm2\n\t" \ 3771 "pclmulqdq $0x00, %%xmm2, %%xmm1\n\t" \ 3772 "pxor %%xmm0, %%xmm1\n\t" \ 3773 "pxor %%xmm3, %%xmm1\n\t" \ 3774 "movdqa %%xmm1, %%xmm2\n\t" \ 3775 "movdqa %%xmm0, "#r2"\n\t" \ 3776 "movdqa %%xmm3, " #r "\n\t" \ 3777 "pslldq $8, %%xmm2\n\t" \ 3778 "psrldq $8, %%xmm1\n\t" \ 3779 "pxor %%xmm2, "#r2"\n\t" \ 3780 "pxor %%xmm1, " #r "\n\t" 3781 #define GHASH_GFMUL_AVX(r, r2, a, b) \ 3782 _GHASH_GFMUL_AVX(r, r2, a, b) 3783 3784 #define _GHASH_GFMUL_XOR_AVX(r, r2, a, b) \ 3785 "pshufd $0x4e, "#a", %%xmm1\n\t" \ 3786 "pshufd $0x4e, "#b", %%xmm2\n\t" \ 3787 "movdqa "#b", %%xmm3\n\t" \ 3788 "movdqa "#b", %%xmm0\n\t" \ 3789 "pclmulqdq $0x11, "#a", %%xmm3\n\t" \ 3790 "pclmulqdq $0x00, "#a", %%xmm0\n\t" \ 3791 "pxor "#a", %%xmm1\n\t" \ 3792 "pxor "#b", %%xmm2\n\t" \ 3793 "pclmulqdq $0x00, %%xmm2, %%xmm1\n\t" \ 3794 "pxor %%xmm0, %%xmm1\n\t" 
\ 3795 "pxor %%xmm3, %%xmm1\n\t" \ 3796 "movdqa %%xmm1, %%xmm2\n\t" \ 3797 "pxor %%xmm0, "#r2"\n\t" \ 3798 "pxor %%xmm3, " #r "\n\t" \ 3799 "pslldq $8, %%xmm2\n\t" \ 3800 "psrldq $8, %%xmm1\n\t" \ 3801 "pxor %%xmm2, "#r2"\n\t" \ 3802 "pxor %%xmm1, " #r "\n\t" 3803 #define GHASH_GFMUL_XOR_AVX(r, r2, a, b) \ 3804 _GHASH_GFMUL_XOR_AVX(r, r2, a, b) 3805 3806 #define GHASH_MID_AVX(r, r2) \ 3807 "movdqa "#r2", %%xmm0\n\t" \ 3808 "movdqa " #r ", %%xmm1\n\t" \ 3809 "psrld $31, %%xmm0\n\t" \ 3810 "psrld $31, %%xmm1\n\t" \ 3811 "pslld $1, "#r2"\n\t" \ 3812 "pslld $1, " #r "\n\t" \ 3813 "movdqa %%xmm0, %%xmm2\n\t" \ 3814 "pslldq $4, %%xmm0\n\t" \ 3815 "psrldq $12, %%xmm2\n\t" \ 3816 "pslldq $4, %%xmm1\n\t" \ 3817 "por %%xmm2, " #r "\n\t" \ 3818 "por %%xmm0, "#r2"\n\t" \ 3819 "por %%xmm1, " #r "\n\t" 3820 3821 #define _GHASH_GFMUL_RED_AVX(r, a, b) \ 3822 "pshufd $0x4e, "#a", %%xmm5\n\t" \ 3823 "pshufd $0x4e, "#b", %%xmm6\n\t" \ 3824 "movdqa "#b", %%xmm7\n\t" \ 3825 "movdqa "#b", %%xmm4\n\t" \ 3826 "pclmulqdq $0x11, "#a", %%xmm7\n\t" \ 3827 "pclmulqdq $0x00, "#a", %%xmm4\n\t" \ 3828 "pxor "#a", %%xmm5\n\t" \ 3829 "pxor "#b", %%xmm6\n\t" \ 3830 "pclmulqdq $0x00, %%xmm6, %%xmm5\n\t" \ 3831 "pxor %%xmm4, %%xmm5\n\t" \ 3832 "pxor %%xmm7, %%xmm5\n\t" \ 3833 "movdqa %%xmm5, %%xmm6\n\t" \ 3834 "movdqa %%xmm7, " #r "\n\t" \ 3835 "pslldq $8, %%xmm6\n\t" \ 3836 "psrldq $8, %%xmm5\n\t" \ 3837 "pxor %%xmm6, %%xmm4\n\t" \ 3838 "pxor %%xmm5, " #r "\n\t" \ 3839 "movdqa %%xmm4, %%xmm8\n\t" \ 3840 "movdqa %%xmm4, %%xmm9\n\t" \ 3841 "movdqa %%xmm4, %%xmm10\n\t" \ 3842 "pslld $31, %%xmm8\n\t" \ 3843 "pslld $30, %%xmm9\n\t" \ 3844 "pslld $25, %%xmm10\n\t" \ 3845 "pxor %%xmm9, %%xmm8\n\t" \ 3846 "pxor %%xmm10, %%xmm8\n\t" \ 3847 "movdqa %%xmm8, %%xmm9\n\t" \ 3848 "psrldq $4, %%xmm9\n\t" \ 3849 "pslldq $12, %%xmm8\n\t" \ 3850 "pxor %%xmm8, %%xmm4\n\t" \ 3851 "movdqa %%xmm4, %%xmm10\n\t" \ 3852 "movdqa %%xmm4, %%xmm6\n\t" \ 3853 "movdqa %%xmm4, %%xmm5\n\t" \ 3854 "psrld $1, %%xmm10\n\t" \ 3855 "psrld 
$2, %%xmm6\n\t" \ 3856 "psrld $7, %%xmm5\n\t" \ 3857 "pxor %%xmm6, %%xmm10\n\t" \ 3858 "pxor %%xmm5, %%xmm10\n\t" \ 3859 "pxor %%xmm9, %%xmm10\n\t" \ 3860 "pxor %%xmm4, %%xmm10\n\t" \ 3861 "pxor %%xmm10, " #r "\n\t" 3862 #define GHASH_GFMUL_RED_AVX(r, a, b) \ 3863 _GHASH_GFMUL_RED_AVX(r, a, b) 3864 3865 #define GHASH_RED_AVX(r, r2) \ 3866 "movdqa "#r2", %%xmm0\n\t" \ 3867 "movdqa "#r2", %%xmm1\n\t" \ 3868 "movdqa "#r2", %%xmm2\n\t" \ 3869 "pslld $31, %%xmm0\n\t" \ 3870 "pslld $30, %%xmm1\n\t" \ 3871 "pslld $25, %%xmm2\n\t" \ 3872 "pxor %%xmm1, %%xmm0\n\t" \ 3873 "pxor %%xmm2, %%xmm0\n\t" \ 3874 "movdqa %%xmm0, %%xmm1\n\t" \ 3875 "psrldq $4, %%xmm1\n\t" \ 3876 "pslldq $12, %%xmm0\n\t" \ 3877 "pxor %%xmm0, "#r2"\n\t" \ 3878 "movdqa "#r2", %%xmm2\n\t" \ 3879 "movdqa "#r2", %%xmm3\n\t" \ 3880 "movdqa "#r2", %%xmm0\n\t" \ 3881 "psrld $1, %%xmm2\n\t" \ 3882 "psrld $2, %%xmm3\n\t" \ 3883 "psrld $7, %%xmm0\n\t" \ 3884 "pxor %%xmm3, %%xmm2\n\t" \ 3885 "pxor %%xmm0, %%xmm2\n\t" \ 3886 "pxor %%xmm1, %%xmm2\n\t" \ 3887 "pxor "#r2", %%xmm2\n\t" \ 3888 "pxor %%xmm2, " #r "\n\t" 3889 3890 #define GHASH_GFMUL_RED_XOR_AVX(r, r2, a, b) \ 3891 GHASH_GFMUL_XOR_AVX(r, r2, a, b) \ 3892 GHASH_RED_AVX(r, r2) 3893 3894 #define GHASH_FULL_AVX(r, r2, a, b) \ 3895 GHASH_GFMUL_AVX(r, r2, a, b) \ 3896 GHASH_MID_AVX(r, r2) \ 3897 GHASH_RED_AVX(r, r2) 3898 3899 #define CALC_IV_12() \ 3900 "# Calculate values when IV is 12 bytes\n\t" \ 3901 "# Set counter based on IV\n\t" \ 3902 "movl $0x01000000, %%ecx\n\t" \ 3903 "pinsrq $0, 0(%%rax), %%xmm13\n\t" \ 3904 "pinsrd $2, 8(%%rax), %%xmm13\n\t" \ 3905 "pinsrd $3, %%ecx, %%xmm13\n\t" \ 3906 "# H = Encrypt X(=0) and T = Encrypt counter\n\t" \ 3907 "movdqu %%xmm13, %%xmm1\n\t" \ 3908 "movdqa 0(%[KEY]), " VAR(HR) "\n\t" \ 3909 "pxor " VAR(HR) ", %%xmm1\n\t" \ 3910 "movdqa 16(%[KEY]), %%xmm12\n\t" \ 3911 "aesenc %%xmm12, " VAR(HR) "\n\t" \ 3912 "aesenc %%xmm12, %%xmm1\n\t" \ 3913 "movdqa 32(%[KEY]), %%xmm12\n\t" \ 3914 "aesenc %%xmm12, " VAR(HR) "\n\t" \ 
        /* (review) Tail of CALC_IV_12(): remaining AES rounds (round keys \
           48..144, optional 160/176 and 192/208/224 for AES-192/256 based \
           on %[nr]), then aesenclast producing H (GHASH key, byte-swapped) \
           and T (stored via TR). Macro head is above this chunk. */ \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 48(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 64(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 80(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 96(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 112(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 128(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 144(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "cmpl $11, %[nr]\n\t" \
        "movdqa 160(%[KEY]), %%xmm12\n\t" \
        "jl 31f\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqa 176(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "cmpl $13, %[nr]\n\t" \
        "movdqa 192(%[KEY]), %%xmm12\n\t" \
        "jl 31f\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqu 208(%[KEY]), %%xmm12\n\t" \
        "aesenc %%xmm12, " VAR(HR) "\n\t" \
        "aesenc %%xmm12, %%xmm1\n\t" \
        "movdqu 224(%[KEY]), %%xmm12\n\t" \
        "31:\n\t" \
        "aesenclast %%xmm12, " VAR(HR) "\n\t" \
        "aesenclast %%xmm12, %%xmm1\n\t" \
        "pshufb %[BSWAP_MASK], " VAR(HR) "\n\t" \
        "movdqu %%xmm1, " VAR(TR) "\n\t" \
        "jmp 39f\n\t"

/* Handle an IV that is not 12 bytes: GHASH the IV (16 bytes at a time,
 * partial final block via a 16-byte stack buffer), append the IV bit
 * length, and encrypt the resulting counter to produce T (stored via TR).
 * Expects %%rax = IV pointer and %%edx = IV byte length on entry. */
#define CALC_IV() \
    "# Calculate values when IV is not 12 bytes\n\t" \
    "# H = Encrypt X(=0)\n\t" \
    "movdqa 0(%[KEY]), " VAR(HR) "\n\t" \
    AESENC_AVX(HR) \
    "pshufb %[BSWAP_MASK], " VAR(HR) "\n\t" \
    "# Calc counter\n\t" \
    "# Initialization vector\n\t" \
    "cmpl $0, %%edx\n\t" \
    "movq $0, %%rcx\n\t" \
    "je 45f\n\t" \
    "cmpl $16, %%edx\n\t" \
    "jl 44f\n\t" \
    "andl $0xfffffff0, %%edx\n\t" \
    "\n" \
    "43:\n\t" \
    "movdqu (%%rax,%%rcx,1), %%xmm4\n\t" \
    "pshufb %[BSWAP_MASK], %%xmm4\n\t" \
    "pxor %%xmm4, %%xmm13\n\t" \
    GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR) \
    "addl $16, %%ecx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 43b\n\t" \
    "movl %[ibytes], %%edx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "je 45f\n\t" \
    "\n" \
    "44:\n\t" \
    "subq $16, %%rsp\n\t" \
    "pxor %%xmm4, %%xmm4\n\t" \
    "xorl %%ebx, %%ebx\n\t" \
    "movdqu %%xmm4, (%%rsp)\n\t" \
    "42:\n\t" \
    "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \
    "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \
    "incl %%ecx\n\t" \
    "incl %%ebx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 42b\n\t" \
    "movdqu (%%rsp), %%xmm4\n\t" \
    "addq $16, %%rsp\n\t" \
    "pshufb %[BSWAP_MASK], %%xmm4\n\t" \
    "pxor %%xmm4, %%xmm13\n\t" \
    GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR) \
    "\n" \
    "45:\n\t" \
    "# T = Encrypt counter\n\t" \
    "pxor %%xmm0, %%xmm0\n\t" \
    "shll $3, %%edx\n\t" \
    "pinsrq $0, %%rdx, %%xmm0\n\t" \
    "pxor %%xmm0, %%xmm13\n\t" \
    GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR) \
    "pshufb %[BSWAP_MASK], %%xmm13\n\t" \
    "# Encrypt counter\n\t" \
    "movdqa 0(%[KEY]), %%xmm4\n\t" \
    "pxor %%xmm13, %%xmm4\n\t" \
    AESENC_AVX(%%xmm4) \
    "movdqu %%xmm4, " VAR(TR) "\n\t"

/* Fold the additional authenticated data ([addt], [abytes] bytes) into
 * the GHASH accumulator XR, 16 bytes at a time; a partial final block is
 * zero-padded through a temporary 16-byte stack buffer. */
#define CALC_AAD() \
    "# Additional authentication data\n\t" \
    "movl %[abytes], %%edx\n\t" \
    "cmpl $0, %%edx\n\t" \
    "je 25f\n\t" \
    "movq %[addt], %%rax\n\t" \
    "xorl %%ecx, %%ecx\n\t" \
    "cmpl $16, %%edx\n\t" \
    "jl 24f\n\t" \
    "andl $0xfffffff0, %%edx\n\t" \
    "\n" \
    "23:\n\t" \
    "movdqu (%%rax,%%rcx,1), %%xmm4\n\t" \
    "pshufb %[BSWAP_MASK], %%xmm4\n\t" \
    "pxor %%xmm4, " VAR(XR) "\n\t" \
    GHASH_FULL_AVX(XR, %%xmm12, XR, HR) \
    "addl $16, %%ecx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 23b\n\t" \
    "movl %[abytes], %%edx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "je 25f\n\t" \
    "\n" \
    "24:\n\t" \
    "subq $16, %%rsp\n\t" \
    "pxor %%xmm4, %%xmm4\n\t" \
    "xorl %%ebx, %%ebx\n\t" \
    "movdqu %%xmm4, (%%rsp)\n\t" \
    "22:\n\t" \
    "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \
    "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \
    "incl %%ecx\n\t" \
    "incl %%ebx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 22b\n\t" \
    "movdqu (%%rsp), %%xmm4\n\t" \
    "addq $16, %%rsp\n\t" \
    "pshufb %[BSWAP_MASK], %%xmm4\n\t" \
    "pxor %%xmm4, " VAR(XR) "\n\t" \
    GHASH_FULL_AVX(XR, %%xmm12, XR, HR) \
    "\n" \
    "25:\n\t"

/* Precompute the table of hash-key powers H^1..H^8 into HTR (16 bytes
 * each, H^i at offset (i-1)*16) for the 8-block unrolled GHASH. */
#define CALC_HT_8_AVX() \
    "movdqa " VAR(XR) ", %%xmm2\n\t" \
    "# H ^ 1\n\t" \
    "movdqu " VAR(HR) ", 0(" VAR(HTR) ")\n\t" \
    "# H ^ 2\n\t" \
    GHASH_GFMUL_RED_AVX(%%xmm0, HR, HR) \
    "movdqu %%xmm0 , 16(" VAR(HTR) ")\n\t" \
    "# H ^ 3\n\t" \
    GHASH_GFMUL_RED_AVX(%%xmm1, HR, %%xmm0) \
    "movdqu %%xmm1 , 32(" VAR(HTR) ")\n\t" \
    "# H ^ 4\n\t" \
    GHASH_GFMUL_RED_AVX(%%xmm3, %%xmm0, %%xmm0) \
    "movdqu %%xmm3 , 48(" VAR(HTR) ")\n\t" \
    "# H ^ 5\n\t" \
    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm0, %%xmm1) \
    "movdqu %%xmm12, 64(" VAR(HTR) ")\n\t" \
    "# H ^ 6\n\t" \
    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm1, %%xmm1) \
    "movdqu %%xmm12, 80(" VAR(HTR) ")\n\t" \
    "# H ^ 7\n\t" \
    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm1, %%xmm3) \
    "movdqu %%xmm12, 96(" VAR(HTR) ")\n\t" \
    "# H ^ 8\n\t" \
    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm3, %%xmm3) \
    "movdqu %%xmm12, 112(" VAR(HTR) ")\n\t"

/* Encrypt the next 128 bytes (8 counter blocks, xmm4..xmm11) while
 * interleaving the GHASH of the previous 8 blocks read at offset o from
 * src; handles the extra rounds for AES-192/256 via %[nr]. */
#define AESENC_128_GHASH_AVX(src, o) \
    "leaq (%[in]," VAR(KR64) ",1), %%rcx\n\t" \
    "leaq (%[out]," VAR(KR64) ",1), %%rdx\n\t" \
    /* src is either %%rcx or %%rdx */ \
    AESENC_CTR() \
    AESENC_XOR() \
    AESENC_PCLMUL_1(src,  16, o-128, 112) \
    AESENC_PCLMUL_N(src,  32, o-112,  96) \
    AESENC_PCLMUL_N(src,  48, o -96,  80) \
    AESENC_PCLMUL_N(src,  64, o -80,  64) \
    AESENC_PCLMUL_N(src,  80, o -64,  48) \
    AESENC_PCLMUL_N(src,  96, o -48,  32) \
    AESENC_PCLMUL_N(src, 112, o -32,  16) \
    AESENC_PCLMUL_N(src, 128, o -16,   0) \
    AESENC_PCLMUL_L(144) \
    "cmpl $11, %[nr]\n\t" \
    "movdqa 160(%[KEY]), %%xmm12\n\t" \
    "jl 4f\n\t" \
    AESENC() \
    AESENC_SET(176) \
    "cmpl $13, %[nr]\n\t" \
    "movdqa 192(%[KEY]), %%xmm12\n\t" \
    "jl 4f\n\t" \
    AESENC() \
    AESENC_SET(208) \
    "movdqa 224(%[KEY]), %%xmm12\n\t" \
    "\n" \
    "4:\n\t" \
    AESENC_LAST(%%rcx, %%rdx)

/* Encrypt-side handling of a trailing partial block (< 16 bytes):
 * encrypt one more counter block, XOR it byte-by-byte with the input,
 * and fold the zero-padded ciphertext into XR via a stack buffer. */
#define AESENC_LAST15_ENC_AVX() \
    "movl %[nbytes], %%ecx\n\t" \
    "movl %%ecx, %%edx\n\t" \
    "andl $0x0f, %%ecx\n\t" \
    "jz 55f\n\t" \
    "movdqu " VAR(CTR1) ", %%xmm13\n\t" \
    "pshufb %[BSWAP_EPI64], %%xmm13\n\t" \
    "pxor 0(%[KEY]), %%xmm13\n\t" \
    AESENC_AVX(%%xmm13) \
    "subq $16, %%rsp\n\t" \
    "xorl %%ecx, %%ecx\n\t" \
    "movdqu %%xmm13, (%%rsp)\n\t" \
    "\n" \
    "51:\n\t" \
    "movzbl (%[in]," VAR(KR64) ",1), %%r13d\n\t" \
    "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \
    "movb %%r13b, (%[out]," VAR(KR64) ",1)\n\t" \
    "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \
    "incl " VAR(KR) "\n\t" \
    "incl %%ecx\n\t" \
    "cmpl %%edx, " VAR(KR) "\n\t" \
    "jl 51b\n\t" \
    "xorq %%r13, %%r13\n\t" \
    "cmpl $16, %%ecx\n\t" \
    "je 53f\n\t" \
    "\n" \
    "52:\n\t" \
    "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \
    "incl %%ecx\n\t" \
    "cmpl $16, %%ecx\n\t" \
    "jl 52b\n\t" \
    "53:\n\t" \
    "movdqu (%%rsp), %%xmm13\n\t" \
    "addq $16, %%rsp\n\t" \
    "pshufb %[BSWAP_MASK], %%xmm13\n\t" \
    "pxor %%xmm13, " VAR(XR) "\n\t" \
    GHASH_GFMUL_RED_AVX(XR, HR, XR) \

/* Decrypt-side handling of a trailing partial block: the GHASH input is
 * the incoming ciphertext, so it is staged (zero-padded) at rsp+16
 * before being XORed with the keystream and folded into XR. */
#define AESENC_LAST15_DEC_AVX() \
    "movl %[nbytes], %%ecx\n\t" \
    "movl %%ecx, %%edx\n\t" \
    "andl $0x0f, %%ecx\n\t" \
    "jz 55f\n\t" \
    "movdqu " VAR(CTR1) ", %%xmm13\n\t" \
    "pshufb %[BSWAP_EPI64], %%xmm13\n\t" \
    "pxor 0(%[KEY]), %%xmm13\n\t" \
    AESENC_AVX(%%xmm13) \
    "subq $32, %%rsp\n\t" \
    "xorl %%ecx, %%ecx\n\t" \
    "movdqu %%xmm13, (%%rsp)\n\t" \
    "pxor %%xmm0, %%xmm0\n\t" \
    "movdqu %%xmm0, 16(%%rsp)\n\t" \
    "\n" \
    "51:\n\t" \
    "movzbl (%[in]," VAR(KR64) ",1), %%r13d\n\t" \
    "movb %%r13b, 16(%%rsp,%%rcx,1)\n\t" \
    "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \
    "movb %%r13b, (%[out]," VAR(KR64) ",1)\n\t" \
    "incl " VAR(KR) "\n\t" \
    "incl %%ecx\n\t" \
    "cmpl %%edx, " VAR(KR) "\n\t" \
    "jl 51b\n\t" \
    "53:\n\t" \
    "movdqu 16(%%rsp), %%xmm13\n\t" \
    "addq $32, %%rsp\n\t" \
    "pshufb %[BSWAP_MASK], %%xmm13\n\t" \
    "pxor %%xmm13, " VAR(XR) "\n\t" \
    GHASH_GFMUL_RED_AVX(XR, HR, XR) \

/* Final GHASH step: fold in the 64-bit bit-lengths of the plaintext and
 * AAD, byte-swap, and XOR with T (stored via TR) to form the tag in
 * xmm0. */
#define CALC_TAG() \
    "movl %[nbytes], %%edx\n\t" \
    "movl %[abytes], %%ecx\n\t" \
    "shlq $3, %%rdx\n\t" \
    "shlq $3, %%rcx\n\t" \
    "pinsrq $0, %%rdx, %%xmm0\n\t" \
    "pinsrq $1, %%rcx, %%xmm0\n\t" \
    "pxor %%xmm0, " VAR(XR) "\n\t" \
    GHASH_GFMUL_RED_AVX(XR, HR, XR) \
    "pshufb %[BSWAP_MASK], " VAR(XR) "\n\t" \
    "movdqu " VAR(TR) ", %%xmm0\n\t" \
    "pxor " VAR(XR) ", %%xmm0\n\t" \

/* Store [tbytes] bytes of the tag in xmm0 to [tag]: a single 16-byte
 * store when tbytes == 16, otherwise a byte loop through the stack. */
#define STORE_TAG() \
    "cmpl $16, %[tbytes]\n\t" \
    "je 71f\n\t" \
    "xorq %%rcx, %%rcx\n\t" \
    "movdqu %%xmm0, (%%rsp)\n\t" \
    "73:\n\t" \
    "movzbl (%%rsp,%%rcx,1), %%r13d\n\t" \
    "movb %%r13b, (%[tag],%%rcx,1)\n\t" \
    "incl %%ecx\n\t" \
    "cmpl %[tbytes], %%ecx\n\t" \
    "jne 73b\n\t" \
    "jmp 72f\n\t" \
    "\n" \
    "71:\n\t" \
    "movdqu %%xmm0, (%[tag])\n\t" \
    "\n" \
    "72:\n\t"

/* Compare the computed tag in xmm0 with [tbytes] bytes at [tag] and
 * store 1 (match) or 0 via [res].  The partial-length path accumulates
 * XOR differences with orb (no early exit); the 16-byte path uses
 * pcmpeqb/pmovmskb against mask 0xffff. */
#define CMP_TAG() \
    "cmpl $16, %[tbytes]\n\t" \
    "je 71f\n\t" \
    "subq $16, %%rsp\n\t" \
    "xorq %%rcx, %%rcx\n\t" \
    "xorq %%rax, %%rax\n\t" \
    "movdqu %%xmm0, (%%rsp)\n\t" \
    "\n" \
    "73:\n\t" \
    "movzbl (%%rsp,%%rcx,1), %%r13d\n\t" \
    "xorb (%[tag],%%rcx,1), %%r13b\n\t" \
    "orb %%r13b, %%al\n\t" \
    "incl %%ecx\n\t" \
    "cmpl %[tbytes], %%ecx\n\t" \
    "jne 73b\n\t" \
    "cmpb $0x00, %%al\n\t" \
    "sete %%al\n\t" \
    "addq $16, %%rsp\n\t" \
    "xorq %%rcx, %%rcx\n\t" \
    "jmp 72f\n\t" \
    "\n" \
    "71:\n\t" \
    "movdqu (%[tag]), %%xmm1\n\t" \
    "pcmpeqb %%xmm1, %%xmm0\n\t" \
    "pmovmskb %%xmm0, %%edx\n\t" \
    "# %%edx == 0xFFFF then return 1 else => return 0\n\t" \
    "xorl %%eax, %%eax\n\t" \
    "cmpl $0xffff, %%edx\n\t" \
    "sete %%al\n\t" \
    "\n" \
    "72:\n\t" \
    "movl %%eax, (%[res])\n\t"

/* AES-GCM encrypt (AESNI/SSE path).
 *
 * in/out  : nbytes of plaintext/ciphertext
 * addt    : abytes of additional authenticated data
 * ivec    : ibytes of IV (12-byte IV takes the fast CALC_IV_12 path)
 * tag     : receives tbytes of authentication tag
 * key/nr  : expanded AES key schedule and round count (9/11/13 compares
 *           against %[nr] select AES-128/192/256)
 *
 * The IV pointer and length are pinned to rax/ebx because the asm body
 * consumes them there.  The unrolled 8-block path is compiled out when
 * AES_GCM_AESNI_NO_UNROLL or AES_GCM_AVX1_NO_UNROLL is defined. */
static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
                            const unsigned char* addt,
                            const unsigned char* ivec, unsigned char *tag,
                            unsigned int nbytes, unsigned int abytes,
                            unsigned int ibytes, unsigned int tbytes,
                            const unsigned char* key, int nr)
{
    register const unsigned char* iv asm("rax") = ivec;
    register unsigned int ivLen asm("ebx") = ibytes;

    __asm__ __volatile__ (
        "subq $" VAR(STACK_OFFSET) ", %%rsp\n\t"
        /* Counter is xmm13 */
        "pxor %%xmm13, %%xmm13\n\t"
        "pxor " VAR(XR) ", " VAR(XR) "\n\t"
        "movl %[ibytes], %%edx\n\t"
        "cmpl $12, %%edx\n\t"
        "jne 35f\n\t"
        CALC_IV_12()
        "\n"
        "35:\n\t"
        CALC_IV()
        "\n"
        "39:\n\t"

        CALC_AAD()

        /* H is doubled in GF(2^128) (shift left by one with conditional
         * reduction by MOD2_128) and the first counter value saved. */
        "# Calculate counter and H\n\t"
        "pshufb %[BSWAP_EPI64], %%xmm13\n\t"
        "movdqa " VAR(HR) ", %%xmm5\n\t"
        "paddd %[ONE], %%xmm13\n\t"
        "movdqa " VAR(HR) ", %%xmm4\n\t"
        "movdqu %%xmm13, " VAR(CTR1) "\n\t"
        "psrlq $63, %%xmm5\n\t"
        "psllq $1, %%xmm4\n\t"
        "pslldq $8, %%xmm5\n\t"
        "por %%xmm5, %%xmm4\n\t"
        "pshufd $0xff, " VAR(HR) ", " VAR(HR) "\n\t"
        "psrad $31, " VAR(HR) "\n\t"
        "pand %[MOD2_128], " VAR(HR) "\n\t"
        "pxor %%xmm4, " VAR(HR) "\n\t"

        "xorl " VAR(KR) ", " VAR(KR) "\n\t"

#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
        /* Unrolled path: process 128 bytes (8 blocks) per iteration. */
        "cmpl $128, %[nbytes]\n\t"
        "movl %[nbytes], %%r13d\n\t"
        "jl 5f\n\t"
        "andl $0xffffff80, %%r13d\n\t"

        CALC_HT_8_AVX()

        "# First 128 bytes of input\n\t"
        AESENC_CTR()
        AESENC_XOR()
        AESENC_SET(16)
        AESENC_SET(32)
        AESENC_SET(48)
        AESENC_SET(64)
        AESENC_SET(80)
        AESENC_SET(96)
        AESENC_SET(112)
        AESENC_SET(128)
        AESENC_SET(144)
        "cmpl $11, %[nr]\n\t"
        "movdqa 160(%[KEY]), %%xmm12\n\t"
        "jl 1f\n\t"
        AESENC()
        AESENC_SET(176)
        "cmpl $13, %[nr]\n\t"
        "movdqa 192(%[KEY]), %%xmm12\n\t"
        "jl 1f\n\t"
        AESENC()
        AESENC_SET(208)
        "movdqa 224(%[KEY]), %%xmm12\n\t"
        "\n"
        "1:\n\t"
        AESENC_LAST(%[in], %[out])

        "cmpl $128, %%r13d\n\t"
        "movl $128, " VAR(KR) "\n\t"
        "jle 2f\n\t"

        "# More 128 bytes of input\n\t"
        "\n"
        "3:\n\t"
        AESENC_128_GHASH_AVX(%%rdx, 0)
        "addl $128, " VAR(KR) "\n\t"
        "cmpl %%r13d, " VAR(KR) "\n\t"
        "jl 3b\n\t"
        "\n"
        "2:\n\t"
        /* GHASH the final 8 ciphertext blocks (xmm4..xmm11) using the
         * precomputed H-power table, highest power first. */
        "movdqa %[BSWAP_MASK], %%xmm13\n\t"
        "pshufb %%xmm13, %%xmm4\n\t"
        "pshufb %%xmm13, %%xmm5\n\t"
        "pshufb %%xmm13, %%xmm6\n\t"
        "pshufb %%xmm13, %%xmm7\n\t"
        "pxor %%xmm2, %%xmm4\n\t"
        "pshufb %%xmm13, %%xmm8\n\t"
        "pshufb %%xmm13, %%xmm9\n\t"
        "pshufb %%xmm13, %%xmm10\n\t"
        "pshufb %%xmm13, %%xmm11\n\t"

        "movdqu 112(" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_AVX(XR, %%xmm13, %%xmm4, %%xmm12)
        "movdqu 96(" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm5, %%xmm12)
        "movdqu 80(" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm6, %%xmm12)
        "movdqu 64(" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm7, %%xmm12)
        "movdqu 48(" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm8, %%xmm12)
        "movdqu 32(" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm9, %%xmm12)
        "movdqu 16(" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm10, %%xmm12)
        "movdqu (" VAR(HTR) "), %%xmm12\n\t"
        GHASH_GFMUL_RED_XOR_AVX(XR, %%xmm13, %%xmm11, %%xmm12)

        "movdqu 0(" VAR(HTR) "), " VAR(HR) "\n\t"
        "\n"
        "5:\n\t"
        "movl %[nbytes], %%edx\n\t"
        "cmpl %%edx, " VAR(KR) "\n\t"
        "jge 55f\n\t"
#endif

        /* Remaining full 16-byte blocks, one at a time. */
        "movl %[nbytes], %%r13d\n\t"
        "andl $0xfffffff0, %%r13d\n\t"
        "cmpl %%r13d, " VAR(KR) "\n\t"
        "jge 14f\n\t"

        "leaq (%[in]," VAR(KR64) ",1), %%rcx\n\t"
        "leaq (%[out]," VAR(KR64) ",1), %%rdx\n\t"
        AESENC_BLOCK(%%rcx, %%rdx)
        "addl $16, " VAR(KR) "\n\t"
        "cmpl %%r13d, " VAR(KR) "\n\t"
        "jge 13f\n\t"
        "\n"
        "12:\n\t"
        "leaq (%[in]," VAR(KR64) ",1), %%rcx\n\t"
        "leaq (%[out]," VAR(KR64) ",1), %%rdx\n\t"
        AESENC_GFMUL(%%rcx, %%rdx, HR, XR)
        "pshufb %[BSWAP_MASK], %%xmm4\n\t"
        "pxor %%xmm4, " VAR(XR) "\n\t"
        "addl $16, " VAR(KR) "\n\t"
        "cmpl %%r13d, " VAR(KR) "\n\t"
        "jl 12b\n\t"
        "\n"
        "13:\n\t"
        GHASH_GFMUL_RED_AVX(XR, HR, XR)
        "\n"
        "14:\n\t"

        AESENC_LAST15_ENC_AVX()
        "\n"
        "55:\n\t"

        CALC_TAG()
        STORE_TAG()
        "addq $" VAR(STACK_OFFSET) ", %%rsp\n\t"

        :
        : [KEY] "r" (key),
          [in] "r" (in), [out] "r" (out), [nr] "r" (nr),
          [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt),
          [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tbytes),
          [tag] "r" (tag),
          [BSWAP_MASK] "m" (BSWAP_MASK),
          [BSWAP_EPI64] "m" (BSWAP_EPI64),
          [ONE] "m" (ONE),
#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
          [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR),
          [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN),
          [EIGHT] "m" (EIGHT),
#endif
          [MOD2_128] "m" (MOD2_128)
        : "xmm15", "xmm14", "xmm13", "xmm12",
          "xmm0", "xmm1", "xmm2", "xmm3", "memory",
          "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
          "rcx", "rdx", "r13"
    );
}

#ifdef HAVE_INTEL_AVX1
/* AVX1 (VEX-encoded) variants of the macros above. */

/* Encrypt with key in xmm12. */
#define VAESENC() \
    "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" \
    "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" \
    "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" \
    "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" \
    "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" \
    "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" \
    "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" \
    "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"

/* Load the round key at offset o and run one round on all 8 blocks. */
#define VAESENC_SET(o) \
    "vmovdqa "#o"(%[KEY]), %%xmm12\n\t" \
    VAESENC()

/* Build 8 consecutive counter blocks (xmm4..xmm11) from CTR1 and
 * advance the stored counter by 8. */
#define VAESENC_CTR() \
    "vmovdqu " VAR(CTR1) ", %%xmm0\n\t" \
    "vmovdqa %[BSWAP_EPI64], %%xmm1\n\t" \
    "vpshufb %%xmm1, %%xmm0, %%xmm4\n\t" \
    "vpaddd %[ONE], %%xmm0, %%xmm5\n\t" \
    "vpshufb %%xmm1, %%xmm5, %%xmm5\n\t" \
    "vpaddd %[TWO], %%xmm0, %%xmm6\n\t" \
    "vpshufb %%xmm1, %%xmm6, %%xmm6\n\t" \
    "vpaddd %[THREE], %%xmm0, %%xmm7\n\t" \
    "vpshufb %%xmm1, %%xmm7, %%xmm7\n\t" \
    "vpaddd %[FOUR], %%xmm0, %%xmm8\n\t" \
    "vpshufb %%xmm1, %%xmm8, %%xmm8\n\t" \
    "vpaddd %[FIVE], %%xmm0, %%xmm9\n\t" \
    "vpshufb %%xmm1, %%xmm9, %%xmm9\n\t" \
    "vpaddd %[SIX], %%xmm0, %%xmm10\n\t" \
    "vpshufb %%xmm1, %%xmm10, %%xmm10\n\t" \
    "vpaddd %[SEVEN], %%xmm0, %%xmm11\n\t" \
    "vpshufb %%xmm1, %%xmm11, %%xmm11\n\t" \
    "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t"

/* Initial AddRoundKey for the 8 counter blocks; saves the advanced
 * counter back to CTR1. */
#define VAESENC_XOR() \
    "vmovdqa (%[KEY]), %%xmm12\n\t" \
    "vmovdqu %%xmm0, " VAR(CTR1) "\n\t" \
    "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm12, %%xmm5, %%xmm5\n\t" \
    "vpxor %%xmm12, %%xmm6, %%xmm6\n\t" \
    "vpxor %%xmm12, %%xmm7, %%xmm7\n\t" \
    "vpxor %%xmm12, %%xmm8, %%xmm8\n\t" \
    "vpxor %%xmm12, %%xmm9, %%xmm9\n\t" \
    "vpxor %%xmm12, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm12, %%xmm11, %%xmm11\n\t"

/* Encrypt 128 bytes (8 blocks) with no interleaved GHASH. */
#define VAESENC_128() \
    VAESENC_CTR() \
    VAESENC_XOR() \
    VAESENC_SET(16) \
    VAESENC_SET(32) \
    VAESENC_SET(48) \
    VAESENC_SET(64) \
    VAESENC_SET(80) \
    VAESENC_SET(96) \
    VAESENC_SET(112) \
    VAESENC_SET(128) \
    VAESENC_SET(144) \
    "cmpl $11, %[nr]\n\t" \
    "vmovdqa 160(%[KEY]), %%xmm12\n\t" \
    "jl 1f\n\t" \
    VAESENC() \
    VAESENC_SET(176) \
    "cmpl $13, %[nr]\n\t" \
    "vmovdqa 192(%[KEY]), %%xmm12\n\t" \
    "jl 1f\n\t" \
    VAESENC() \
    VAESENC_SET(208) \
    "vmovdqa 224(%[KEY]), %%xmm12\n\t" \
    "\n" \
    "1:\n\t" \
    VAESENC_LAST(%[in], %[out])

/* Encrypt and carry-less multiply for AVX1. */
#define VAESENC_PCLMUL_1(src, o1, o2, o3) \
    "vmovdqu " #o3 "(" VAR(HTR) "), %%xmm12\n\t" \
    "vmovdqu " #o2 "(" #src "), %%xmm0\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \
    "vpxor %%xmm2, %%xmm0, %%xmm0\n\t" \
    "vpshufd $0x4e, %%xmm12, %%xmm1\n\t" \
    "vpshufd $0x4e, %%xmm0, %%xmm14\n\t" \
    "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \
    "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm3\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm5, %%xmm5\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm6, %%xmm6\n\t" \
    "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm2\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm7, %%xmm7\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm8, %%xmm8\n\t" \
    "vpclmulqdq $0x00, %%xmm14, %%xmm1, %%xmm1\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm9, %%xmm9\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm10, %%xmm10\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm11, %%xmm11\n\t" \
    "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \

/* As VAESENC_PCLMUL_1 but accumulates into the running xmm1/xmm2/xmm3
 * Karatsuba partial products. */
#define VAESENC_PCLMUL_N(src, o1, o2, o3) \
    "vmovdqu " #o3 "(" VAR(HTR) "), %%xmm12\n\t" \
    "vmovdqu " #o2 "(" #src "), %%xmm0\n\t" \
    "vpshufd $0x4e, %%xmm12, %%xmm13\n\t" \
    "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm12, %%xmm13, %%xmm13\n\t" \
    "vpshufd $0x4e, %%xmm0, %%xmm14\n\t" \
    "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \
    "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm15\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm5, %%xmm5\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm6, %%xmm6\n\t" \
    "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm12\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm7, %%xmm7\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm8, %%xmm8\n\t" \
    "vpclmulqdq $0x00, %%xmm14, %%xmm13, %%xmm13\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm9, %%xmm9\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm10, %%xmm10\n\t" \
    "vaesenc " #o1 "(%[KEY]), %%xmm11, %%xmm11\n\t" \
    "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \
    "vpxor %%xmm15, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm15, %%xmm3, %%xmm3\n\t" \
    "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \

/* Final GHASH reduction of the accumulated products, interleaved with
 * the AES round using the key at offset o. */
#define VAESENC_PCLMUL_L(o) \
    "vpslldq $8, %%xmm1, %%xmm14\n\t" \
    "vpsrldq $8, %%xmm1, %%xmm1\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \
    "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm5, %%xmm5\n\t" \
    "vpslld $31, %%xmm2, %%xmm12\n\t" \
    "vpslld $30, %%xmm2, %%xmm13\n\t" \
    "vpslld $25, %%xmm2, %%xmm14\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm6, %%xmm6\n\t" \
    "vpxor %%xmm13, %%xmm12, %%xmm12\n\t" \
    "vpxor %%xmm14, %%xmm12, %%xmm12\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm7, %%xmm7\n\t" \
    "vpsrldq $4, %%xmm12, %%xmm13\n\t" \
    "vpslldq $12, %%xmm12, %%xmm12\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm8, %%xmm8\n\t" \
    "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \
    "vpsrld $1, %%xmm2, %%xmm14\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm9, %%xmm9\n\t" \
    "vpsrld $2, %%xmm2, %%xmm1\n\t" \
    "vpsrld $7, %%xmm2, %%xmm0\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm1, %%xmm14, %%xmm14\n\t" \
    "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \
    "vaesenc "#o"(%[KEY]), %%xmm11, %%xmm11\n\t" \
    "vpxor %%xmm13, %%xmm14, %%xmm14\n\t" \
    "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \
    "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \


/* Encrypt and carry-less multiply with last key. */
#define VAESENC_LAST(in, out) \
    "vaesenclast %%xmm12, %%xmm4, %%xmm4\n\t" \
    "vaesenclast %%xmm12, %%xmm5, %%xmm5\n\t" \
    "vmovdqu (" #in "), %%xmm0\n\t" \
    "vmovdqu 16(" #in "), %%xmm1\n\t" \
    "vpxor %%xmm0, %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm1, %%xmm5, %%xmm5\n\t" \
    "vmovdqu %%xmm4, (" #out ")\n\t" \
    "vmovdqu %%xmm5, 16(" #out ")\n\t" \
    "vaesenclast %%xmm12, %%xmm6, %%xmm6\n\t" \
    "vaesenclast %%xmm12, %%xmm7, %%xmm7\n\t" \
    "vmovdqu 32(" #in "), %%xmm0\n\t" \
    "vmovdqu 48(" #in "), %%xmm1\n\t" \
    "vpxor %%xmm0, %%xmm6, %%xmm6\n\t" \
    "vpxor %%xmm1, %%xmm7, %%xmm7\n\t" \
    "vmovdqu %%xmm6, 32(" #out ")\n\t" \
    "vmovdqu %%xmm7, 48(" #out ")\n\t" \
    "vaesenclast %%xmm12, %%xmm8, %%xmm8\n\t" \
    "vaesenclast %%xmm12, %%xmm9, %%xmm9\n\t" \
    "vmovdqu 64(" #in "), %%xmm0\n\t" \
    "vmovdqu 80(" #in "), %%xmm1\n\t" \
    "vpxor %%xmm0, %%xmm8, %%xmm8\n\t" \
    "vpxor %%xmm1, %%xmm9, %%xmm9\n\t" \
    "vmovdqu %%xmm8, 64(" #out ")\n\t" \
    "vmovdqu %%xmm9, 80(" #out ")\n\t" \
    "vaesenclast %%xmm12, %%xmm10, %%xmm10\n\t" \
    "vaesenclast %%xmm12, %%xmm11, %%xmm11\n\t" \
    "vmovdqu 96(" #in "), %%xmm0\n\t" \
    "vmovdqu 112(" #in "), %%xmm1\n\t" \
    "vpxor %%xmm0, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm1, %%xmm11, %%xmm11\n\t" \
    "vmovdqu %%xmm10, 96(" #out ")\n\t" \
    "vmovdqu %%xmm11, 112(" #out ")\n\t"

/* Encrypt a single counter block at [in]+KR64 into [out]+KR64 and XOR
 * the byte-swapped ciphertext into XR.  Uses %= local labels. */
#define VAESENC_BLOCK() \
    "vmovdqu " VAR(CTR1) ", %%xmm5\n\t" \
    "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" \
    "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" \
    "vmovdqu %%xmm5, " VAR(CTR1) "\n\t" \
    "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "cmpl $11, %[nr]\n\t" \
    "vmovdqa 160(%[KEY]), %%xmm5\n\t" \
    "jl %=f\n\t" \
    "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \
    "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "cmpl $13, %[nr]\n\t" \
    "vmovdqa 192(%[KEY]), %%xmm5\n\t" \
    "jl %=f\n\t" \
    "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \
    "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vmovdqa 224(%[KEY]), %%xmm5\n\t" \
    "%=:\n\t" \
    "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \
    "vmovdqu (%[in]," VAR(KR64) ",1), %%xmm5\n\t" \
    "vpxor %%xmm5, %%xmm4, %%xmm4\n\t" \
    "vmovdqu %%xmm4, (%[out]," VAR(KR64) ",1)\n\t" \
    "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"

/* Encrypt one counter block while interleaving a full GHASH multiply
 * of X by H (with reduction by MOD2_128) into XR. */
#define _VAESENC_GFMUL(in, H, X) \
    "vmovdqu " VAR(CTR1) ", %%xmm5\n\t" \
    "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" \
    "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" \
    "vmovdqu %%xmm5, " VAR(CTR1) "\n\t" \
    "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpclmulqdq $0x10, " #H ", " #X ", %%xmm6\n\t" \
    "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpclmulqdq $0x01, " #H ", " #X ", %%xmm7\n\t" \
    "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpclmulqdq $0x00, " #H ", " #X ", %%xmm8\n\t" \
    "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpclmulqdq $0x11, " #H ", " #X ", %%xmm1\n\t" \
    "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm7, %%xmm6, %%xmm6\n\t" \
    "vpslldq $8, %%xmm6, %%xmm2\n\t" \
    "vpsrldq $8, %%xmm6, %%xmm6\n\t" \
    "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm8, %%xmm2, %%xmm2\n\t" \
    "vpxor %%xmm6, %%xmm1, %%xmm3\n\t" \
    "vmovdqa %[MOD2_128], %%xmm0\n\t" \
    "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm7\n\t" \
    "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpshufd $0x4e, %%xmm2, %%xmm6\n\t" \
    "vpxor %%xmm7, %%xmm6, %%xmm6\n\t" \
    "vpclmulqdq $0x10, %%xmm0, %%xmm6, %%xmm7\n\t" \
    "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \
    "vpxor %%xmm7, %%xmm6, %%xmm6\n\t" \
    "vpxor %%xmm3, %%xmm6, " VAR(XR) "\n\t" \
    "cmpl $11, %[nr]\n\t" \
    "vmovdqa 160(%[KEY]), %%xmm5\n\t" \
    "jl 1f\n\t" \
    "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \
    "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "cmpl $13, %[nr]\n\t" \
    "vmovdqa 192(%[KEY]), %%xmm5\n\t" \
    "jl 1f\n\t" \
    "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \
    "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \
    "vmovdqa 224(%[KEY]), %%xmm5\n\t" \
    "1:\n\t" \
    "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \
    "vmovdqu " #in ", %%xmm0\n\t" \
    "vpxor %%xmm0, %%xmm4, %%xmm4\n\t" \
    "vmovdqu %%xmm4, (%[out]," VAR(KR64) ",1)\n\t"
#define VAESENC_GFMUL(in, H, X) \
    _VAESENC_GFMUL(in, H, X)


/* Karatsuba GF(2^128) multiply a*b, unreduced: high half to r, low half
 * to r2. */
#define _GHASH_GFMUL_AVX1(r, r2, a, b) \
    "vpshufd $0x4e, "#a", %%xmm1\n\t" \
    "vpshufd $0x4e, "#b", %%xmm2\n\t" \
    "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \
    "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \
    "vpxor "#a", %%xmm1, %%xmm1\n\t" \
    "vpxor "#b", %%xmm2, %%xmm2\n\t" \
    "vpclmulqdq $0x00, %%xmm2, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \
    "vmovdqa %%xmm0, "#r2"\n\t" \
    "vmovdqa %%xmm3, " #r "\n\t" \
    "vpslldq $8, %%xmm1, %%xmm2\n\t" \
    "vpsrldq $8, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm2, "#r2", "#r2"\n\t" \
    "vpxor %%xmm1, " #r ", " #r "\n\t"
#define GHASH_GFMUL_AVX1(r, r2, a, b) \
    _GHASH_GFMUL_AVX1(r, r2, a, b)

/* As GHASH_GFMUL_AVX1 but XORs the product into existing r/r2. */
#define _GHASH_GFMUL_XOR_AVX1(r, r2, a, b) \
    "vpshufd $0x4e, "#a", %%xmm1\n\t" \
    "vpshufd $0x4e, "#b", %%xmm2\n\t" \
    "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \
    "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \
    "vpxor "#a", %%xmm1, %%xmm1\n\t" \
    "vpxor "#b", %%xmm2, %%xmm2\n\t" \
    "vpclmulqdq $0x00, %%xmm2, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm0, "#r2", "#r2"\n\t" \
    "vpxor %%xmm3, " #r ", " #r "\n\t" \
    "vpslldq $8, %%xmm1, %%xmm2\n\t" \
    "vpsrldq $8, %%xmm1, %%xmm1\n\t" \
    "vpxor %%xmm2, "#r2", "#r2"\n\t" \
    "vpxor %%xmm1, " #r ", " #r "\n\t"
#define GHASH_GFMUL_XOR_AVX1(r, r2, a, b) \
    _GHASH_GFMUL_XOR_AVX1(r, r2, a, b)

/* Shift the 256-bit value in r:r2 left by one bit. */
#define GHASH_MID_AVX1(r, r2) \
    "vpsrld $31, "#r2", %%xmm0\n\t" \
    "vpsrld $31, " #r ", %%xmm1\n\t" \
    "vpslld $1, "#r2", "#r2"\n\t" \
    "vpslld $1, " #r ", " #r "\n\t" \
    "vpsrldq $12, %%xmm0, %%xmm2\n\t" \
    "vpslldq $4, %%xmm0, %%xmm0\n\t" \
    "vpslldq $4, %%xmm1, %%xmm1\n\t" \
    "vpor %%xmm2, " #r ", " #r "\n\t" \
    "vpor %%xmm0, "#r2", "#r2"\n\t" \
    "vpor %%xmm1, " #r ", " #r "\n\t"

/* GF(2^128) multiply a*b with reduction, result in r (shift-based
 * reduction by the GHASH polynomial). */
#define _GHASH_GFMUL_RED_AVX1(r, a, b) \
    "vpshufd $0x4e, "#a", %%xmm5\n\t" \
    "vpshufd $0x4e, "#b", %%xmm6\n\t" \
    "vpclmulqdq $0x11, "#a", "#b", %%xmm7\n\t" \
    "vpclmulqdq $0x00, "#a", "#b", %%xmm4\n\t" \
    "vpxor "#a", %%xmm5, %%xmm5\n\t" \
    "vpxor "#b", %%xmm6, %%xmm6\n\t" \
    "vpclmulqdq $0x00, %%xmm6, %%xmm5, %%xmm5\n\t" \
    "vpxor %%xmm4, %%xmm5, %%xmm5\n\t" \
    "vpxor %%xmm7, %%xmm5, %%xmm5\n\t" \
    "vpslldq $8, %%xmm5, %%xmm6\n\t" \
    "vpsrldq $8, %%xmm5, %%xmm5\n\t" \
    "vpxor %%xmm6, %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm5, %%xmm7, " #r "\n\t" \
    "vpslld $31, %%xmm4, %%xmm8\n\t" \
    "vpslld $30, %%xmm4, %%xmm9\n\t" \
    "vpslld $25, %%xmm4, %%xmm10\n\t" \
    "vpxor %%xmm9, %%xmm8, %%xmm8\n\t" \
    "vpxor %%xmm10, %%xmm8, %%xmm8\n\t" \
    "vpsrldq $4, %%xmm8, %%xmm9\n\t" \
    "vpslldq $12, %%xmm8, %%xmm8\n\t" \
    "vpxor %%xmm8, %%xmm4, %%xmm4\n\t" \
    "vpsrld $1, %%xmm4, %%xmm10\n\t" \
    "vpsrld $2, %%xmm4, %%xmm6\n\t" \
    "vpsrld $7, %%xmm4, %%xmm5\n\t" \
    "vpxor %%xmm6, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm5, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm9, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm4, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm10, " #r ", " #r "\n\t"
#define GHASH_GFMUL_RED_AVX1(r, a, b) \
    _GHASH_GFMUL_RED_AVX1(r, a, b)

/* GF(2^128) square of a with reduction, result in r (squaring needs
 * only two carry-less multiplies, no Karatsuba middle term). */
#define _GHASH_GFSQR_RED_AVX1(r, a) \
    "vpclmulqdq $0x00, "#a", "#a", %%xmm4\n\t" \
    "vpclmulqdq $0x11, "#a", "#a", " #r "\n\t" \
    "vpslld $31, %%xmm4, %%xmm8\n\t" \
    "vpslld $30, %%xmm4, %%xmm9\n\t" \
    "vpslld $25, %%xmm4, %%xmm10\n\t" \
    "vpxor %%xmm9, %%xmm8, %%xmm8\n\t" \
    "vpxor %%xmm10, %%xmm8, %%xmm8\n\t" \
    "vpsrldq $4, %%xmm8, %%xmm9\n\t" \
    "vpslldq $12, %%xmm8, %%xmm8\n\t" \
    "vpxor %%xmm8, %%xmm4, %%xmm4\n\t" \
    "vpsrld $1, %%xmm4, %%xmm10\n\t" \
    "vpsrld $2, %%xmm4, %%xmm6\n\t" \
    "vpsrld $7, %%xmm4, %%xmm5\n\t" \
    "vpxor %%xmm6, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm5, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm9, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm4, %%xmm10, %%xmm10\n\t" \
    "vpxor %%xmm10, " #r ", " #r "\n\t"
#define GHASH_GFSQR_RED_AVX1(r, a) \
    _GHASH_GFSQR_RED_AVX1(r, a)

/* Reduce the 256-bit value r:r2 modulo the GHASH polynomial into r. */
#define GHASH_RED_AVX1(r, r2) \
    "vpslld $31, "#r2", %%xmm0\n\t" \
    "vpslld $30, "#r2", %%xmm1\n\t" \
    "vpslld $25, "#r2", %%xmm2\n\t" \
    "vpxor %%xmm1, %%xmm0, %%xmm0\n\t" \
    "vpxor %%xmm2, %%xmm0, %%xmm0\n\t" \
    "vmovdqa %%xmm0, %%xmm1\n\t" \
    "vpsrldq $4, %%xmm1, %%xmm1\n\t" \
    "vpslldq $12, %%xmm0, %%xmm0\n\t" \
    "vpxor %%xmm0, "#r2", "#r2"\n\t" \
    "vpsrld $1, "#r2", %%xmm2\n\t" \
    "vpsrld $2, "#r2", %%xmm3\n\t" \
    "vpsrld $7, "#r2", %%xmm0\n\t" \
    "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \
    "vpxor %%xmm0, %%xmm2, %%xmm2\n\t" \
    "vpxor %%xmm1, %%xmm2, %%xmm2\n\t" \
    "vpxor "#r2", %%xmm2, %%xmm2\n\t" \
    "vpxor %%xmm2, " #r ", " #r "\n\t"

/* Multiply-accumulate then reduce. */
#define GHASH_GFMUL_RED_XOR_AVX1(r, r2, a, b) \
    GHASH_GFMUL_XOR_AVX1(r, r2, a, b) \
    GHASH_RED_AVX1(r, r2)

/* Full GHASH step: multiply, bit-shift, reduce. */
#define GHASH_FULL_AVX1(r, r2, a, b) \
    GHASH_GFMUL_AVX1(r, r2, a, b) \
    GHASH_MID_AVX1(r, r2) \
    GHASH_RED_AVX1(r, r2)

/* 12-byte IV fast path (AVX1): counter = IV || 0x00000001, then encrypt
 * the zero block (-> H) and the counter block (-> T) in parallel. */
#define CALC_IV_12_AVX1() \
    "# Calculate values when IV is 12 bytes\n\t" \
    "# Set counter based on IV\n\t" \
    "movl $0x01000000, %%ecx\n\t" \
    "vpinsrq $0, 0(%%rax), %%xmm13, %%xmm13\n\t" \
    "vpinsrd $2, 8(%%rax), %%xmm13, %%xmm13\n\t" \
    "vpinsrd $3, %%ecx, %%xmm13, %%xmm13\n\t" \
    "# H = Encrypt X(=0) and T = Encrypt counter\n\t" \
    "vmovdqa 0(%[KEY]), " VAR(HR) "\n\t" \
    "vpxor " VAR(HR) ", %%xmm13, %%xmm1\n\t" \
    "vmovdqa 16(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 32(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 48(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 64(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 80(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 96(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 112(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 128(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 144(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "cmpl $11, %[nr]\n\t" \
    "vmovdqa 160(%[KEY]), %%xmm12\n\t" \
    "jl 31f\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 176(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "cmpl $13, %[nr]\n\t" \
    "vmovdqa 192(%[KEY]), %%xmm12\n\t" \
    "jl 31f\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqa 208(%[KEY]), %%xmm12\n\t" \
    "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vmovdqu 224(%[KEY]), %%xmm12\n\t" \
    "31:\n\t" \
    "vaesenclast %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \
    "vaesenclast %%xmm12, %%xmm1, %%xmm1\n\t" \
    "vpshufb %[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \
    "vmovdqu %%xmm1, " VAR(TR) "\n\t" \
    "jmp 39f\n\t"

/* Non-12-byte IV (AVX1): GHASH the IV plus its bit length to form the
 * counter, then encrypt it to produce T.  Expects %%rax/%%edx = IV
 * pointer/length. */
#define CALC_IV_AVX1() \
    "# Calculate values when IV is not 12 bytes\n\t" \
    "# H = Encrypt X(=0)\n\t" \
    "vmovdqa 0(%[KEY]), " VAR(HR) "\n\t" \
    VAESENC_AVX(HR) \
    "vpshufb %[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \
    "# Calc counter\n\t" \
    "# Initialization vector\n\t" \
    "cmpl $0, %%edx\n\t" \
    "movq $0, %%rcx\n\t" \
    "je 45f\n\t" \
    "cmpl $16, %%edx\n\t" \
    "jl 44f\n\t" \
    "andl $0xfffffff0, %%edx\n\t" \
    "\n" \
    "43:\n\t" \
    "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \
    "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \
    GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR) \
    "addl $16, %%ecx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 43b\n\t" \
    "movl %[ibytes], %%edx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "je 45f\n\t" \
    "\n" \
    "44:\n\t" \
    "subq $16, %%rsp\n\t" \
    "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \
    "xorl %%ebx, %%ebx\n\t" \
    "vmovdqu %%xmm4, (%%rsp)\n\t" \
    "42:\n\t" \
    "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \
    "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \
    "incl %%ecx\n\t" \
    "incl %%ebx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 42b\n\t" \
    "vmovdqu (%%rsp), %%xmm4\n\t" \
    "addq $16, %%rsp\n\t" \
    "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \
    GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR) \
    "\n" \
    "45:\n\t" \
    "# T = Encrypt counter\n\t" \
    "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \
    "shll $3, %%edx\n\t" \
    "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \
    "vpxor %%xmm0, %%xmm13, %%xmm13\n\t" \
    GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR) \
    "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \
    "# Encrypt counter\n\t" \
    "vmovdqa 0(%[KEY]), %%xmm4\n\t" \
    "vpxor %%xmm13, %%xmm4, %%xmm4\n\t" \
    VAESENC_AVX(%%xmm4) \
    "vmovdqu %%xmm4, " VAR(TR) "\n\t"

/* Fold the AAD into XR (AVX1 version of CALC_AAD). */
#define CALC_AAD_AVX1() \
    "# Additional authentication data\n\t" \
    "movl %[abytes], %%edx\n\t" \
    "cmpl $0, %%edx\n\t" \
    "je 25f\n\t" \
    "movq %[addt], %%rax\n\t" \
    "xorl %%ecx, %%ecx\n\t" \
    "cmpl $16, %%edx\n\t" \
    "jl 24f\n\t" \
    "andl $0xfffffff0, %%edx\n\t" \
    "\n" \
    "23:\n\t" \
    "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \
    "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t" \
    GHASH_FULL_AVX1(XR, %%xmm12, XR, HR) \
    "addl $16, %%ecx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 23b\n\t" \
    "movl %[abytes], %%edx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "je 25f\n\t" \
    "\n" \
    "24:\n\t" \
    "subq $16, %%rsp\n\t" \
    "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \
    "xorl %%ebx, %%ebx\n\t" \
    "vmovdqu %%xmm4, (%%rsp)\n\t" \
    "22:\n\t" \
    "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \
    "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \
    "incl %%ecx\n\t" \
    "incl %%ebx\n\t" \
    "cmpl %%edx, %%ecx\n\t" \
    "jl 22b\n\t" \
    "vmovdqu (%%rsp), %%xmm4\n\t" \
    "addq $16, %%rsp\n\t" \
    "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \
    "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t" \
    GHASH_FULL_AVX1(XR, %%xmm12, XR, HR) \
    "\n" \
    "25:\n\t"

/* Precompute H^1..H^8 into HTR; even powers use the cheaper GF square. */
#define CALC_HT_8_AVX1() \
    "vmovdqa " VAR(XR) ", %%xmm2\n\t" \
    "# H ^ 1\n\t" \
    "vmovdqu " VAR(HR) ", 0(" VAR(HTR) ")\n\t" \
    "# H ^ 2\n\t" \
    GHASH_GFSQR_RED_AVX1(%%xmm0, HR) \
    "vmovdqu %%xmm0 , 16(" VAR(HTR) ")\n\t" \
    "# H ^ 3\n\t" \
    GHASH_GFMUL_RED_AVX1(%%xmm1, HR, %%xmm0) \
    "vmovdqu %%xmm1 , 32(" VAR(HTR) ")\n\t" \
    "# H ^ 4\n\t" \
    GHASH_GFSQR_RED_AVX1(%%xmm3, %%xmm0) \
    "vmovdqu %%xmm3 , 48(" VAR(HTR) ")\n\t" \
    "# H ^ 5\n\t" \
    GHASH_GFMUL_RED_AVX1(%%xmm12, %%xmm0, %%xmm1) \
    "vmovdqu %%xmm12, 64(" VAR(HTR) ")\n\t" \
    "# H ^ 6\n\t" \
    GHASH_GFSQR_RED_AVX1(%%xmm12, %%xmm1) \
    "vmovdqu %%xmm12, 80(" VAR(HTR) ")\n\t" \
    "# H ^ 7\n\t" \
    GHASH_GFMUL_RED_AVX1(%%xmm12, %%xmm1, %%xmm3) \
    "vmovdqu %%xmm12, 96(" VAR(HTR) ")\n\t" \
    "# H ^ 8\n\t" \
    GHASH_GFSQR_RED_AVX1(%%xmm12, %%xmm3) \
    "vmovdqu %%xmm12, 112(" VAR(HTR) ")\n\t"

/* Encrypt 128 bytes while GHASHing the previous 8 blocks (AVX1). */
#define VAESENC_128_GHASH_AVX1(src, o) \
    "leaq (%[in]," VAR(KR64) ",1), %%rcx\n\t" \
    "leaq (%[out]," VAR(KR64) ",1), %%rdx\n\t" \
    /* src is either %%rcx or %%rdx */ \
    VAESENC_CTR() \
    VAESENC_XOR() \
    VAESENC_PCLMUL_1(src,  16, (o-128), 112) \
    VAESENC_PCLMUL_N(src,  32, (o-112),  96) \
    VAESENC_PCLMUL_N(src,  48, (o- 96),  80) \
    VAESENC_PCLMUL_N(src,  64, (o- 80),  64) \
    VAESENC_PCLMUL_N(src,  80, (o- 64),  48) \
    VAESENC_PCLMUL_N(src,  96, (o- 48),  32) \
    VAESENC_PCLMUL_N(src, 112, (o- 32),  16) \
    VAESENC_PCLMUL_N(src, 128, (o- 16),   0) \
    VAESENC_PCLMUL_L(144) \
    "cmpl $11, %[nr]\n\t" \
    "vmovdqa 160(%[KEY]), %%xmm12\n\t" \
    "jl 4f\n\t" \
    VAESENC() \
    VAESENC_SET(176) \
    "cmpl $13, %[nr]\n\t" \
    "vmovdqa 192(%[KEY]), %%xmm12\n\t" \
    "jl 4f\n\t" \
    VAESENC() \
    VAESENC_SET(208) \
    "vmovdqa 224(%[KEY]), %%xmm12\n\t" \
    "\n" \
    "4:\n\t" \
    VAESENC_LAST(%%rcx, %%rdx)

/* All AES rounds for one block r (initial AddRoundKey done by caller);
 * uses %= local labels to skip rounds not present for the key size. */
#define _VAESENC_AVX(r) \
    "vaesenc 16(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 32(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 48(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 64(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 80(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 96(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 112(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 128(%[KEY]), " #r ", " #r "\n\t" \
    "vaesenc 144(%[KEY]), " #r ", " #r "\n\t" \
    "cmpl $11, %[nr]\n\t" \
    "vmovdqa 160(%[KEY]), %%xmm5\n\t" \
    "jl %=f\n\t" \
    "vaesenc %%xmm5, " #r ", " #r "\n\t" \
    "vaesenc 176(%[KEY]), " #r ", " #r "\n\t" \
    "cmpl $13, %[nr]\n\t" \
    "vmovdqa 192(%[KEY]), %%xmm5\n\t" \
    "jl %=f\n\t" \
    "vaesenc %%xmm5, " #r ", " #r "\n\t" \
    "vaesenc 208(%[KEY]), " #r ", " #r "\n\t" \
    "vmovdqa 224(%[KEY]), %%xmm5\n\t" \
    "%=:\n\t" \
    "vaesenclast %%xmm5, " #r ", " #r "\n\t"
#define VAESENC_AVX(r) \
    _VAESENC_AVX(r)

/* Encrypt-side trailing partial block (AVX1); macro continues past the
 * end of this chunk. */
#define AESENC_LAST15_ENC_AVX1() \
    "movl %[nbytes], %%ecx\n\t" \
    "movl %%ecx, %%edx\n\t" \
    "andl $0x0f, %%ecx\n\t" \
    "jz 55f\n\t" \
    "vmovdqu " VAR(CTR1) ", %%xmm13\n\t" \
    "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \
    "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \
    VAESENC_AVX(%%xmm13) \
    "subq $16, %%rsp\n\t" \
    "xorl %%ecx, %%ecx\n\t" \
    "vmovdqu %%xmm13,
(%%rsp)\n\t" \ 5103 "\n" \ 5104 "51:\n\t" \ 5105 "movzbl (%[in]," VAR(KR64) ",1), %%r13d\n\t" \ 5106 "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ 5107 "movb %%r13b, (%[out]," VAR(KR64) ",1)\n\t" \ 5108 "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ 5109 "incl " VAR(KR) "\n\t" \ 5110 "incl %%ecx\n\t" \ 5111 "cmpl %%edx, " VAR(KR) "\n\t" \ 5112 "jl 51b\n\t" \ 5113 "xorq %%r13, %%r13\n\t" \ 5114 "cmpl $16, %%ecx\n\t" \ 5115 "je 53f\n\t" \ 5116 "\n" \ 5117 "52:\n\t" \ 5118 "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ 5119 "incl %%ecx\n\t" \ 5120 "cmpl $16, %%ecx\n\t" \ 5121 "jl 52b\n\t" \ 5122 "53:\n\t" \ 5123 "vmovdqu (%%rsp), %%xmm13\n\t" \ 5124 "addq $16, %%rsp\n\t" \ 5125 "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ 5126 "vpxor %%xmm13, " VAR(XR) ", " VAR(XR) "\n\t" \ 5127 GHASH_GFMUL_RED_AVX1(XR, HR, XR) \ 5128 5129 #define AESENC_LAST15_DEC_AVX1() \ 5130 "movl %[nbytes], %%ecx\n\t" \ 5131 "movl %%ecx, %%edx\n\t" \ 5132 "andl $0x0f, %%ecx\n\t" \ 5133 "jz 55f\n\t" \ 5134 "vmovdqu " VAR(CTR1) ", %%xmm13\n\t" \ 5135 "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \ 5136 "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \ 5137 VAESENC_AVX(%%xmm13) \ 5138 "subq $32, %%rsp\n\t" \ 5139 "xorl %%ecx, %%ecx\n\t" \ 5140 "vmovdqu %%xmm13, (%%rsp)\n\t" \ 5141 "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \ 5142 "vmovdqu %%xmm0, 16(%%rsp)\n\t" \ 5143 "\n" \ 5144 "51:\n\t" \ 5145 "movzbl (%[in]," VAR(KR64) ",1), %%r13d\n\t" \ 5146 "movb %%r13b, 16(%%rsp,%%rcx,1)\n\t" \ 5147 "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ 5148 "movb %%r13b, (%[out]," VAR(KR64) ",1)\n\t" \ 5149 "incl " VAR(KR) "\n\t" \ 5150 "incl %%ecx\n\t" \ 5151 "cmpl %%edx, " VAR(KR) "\n\t" \ 5152 "jl 51b\n\t" \ 5153 "53:\n\t" \ 5154 "vmovdqu 16(%%rsp), %%xmm13\n\t" \ 5155 "addq $32, %%rsp\n\t" \ 5156 "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ 5157 "vpxor %%xmm13, " VAR(XR) ", " VAR(XR) "\n\t" \ 5158 GHASH_GFMUL_RED_AVX1(XR, HR, XR) \ 5159 5160 #define CALC_TAG_AVX1() \ 5161 "movl %[nbytes], %%edx\n\t" \ 5162 "movl %[abytes], %%ecx\n\t" \ 5163 "shlq $3, 
%%rdx\n\t" \ 5164 "shlq $3, %%rcx\n\t" \ 5165 "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \ 5166 "vpinsrq $1, %%rcx, %%xmm0, %%xmm0\n\t" \ 5167 "vpxor %%xmm0, " VAR(XR) ", " VAR(XR) "\n\t" \ 5168 GHASH_GFMUL_RED_AVX1(XR, HR, XR) \ 5169 "vpshufb %[BSWAP_MASK], " VAR(XR) ", " VAR(XR) "\n\t" \ 5170 "vpxor " VAR(TR) ", " VAR(XR) ", %%xmm0\n\t" \ 5171 5172 #define STORE_TAG_AVX() \ 5173 "cmpl $16, %[tbytes]\n\t" \ 5174 "je 71f\n\t" \ 5175 "xorq %%rcx, %%rcx\n\t" \ 5176 "vmovdqu %%xmm0, (%%rsp)\n\t" \ 5177 "73:\n\t" \ 5178 "movzbl (%%rsp,%%rcx,1), %%r13d\n\t" \ 5179 "movb %%r13b, (%[tag],%%rcx,1)\n\t" \ 5180 "incl %%ecx\n\t" \ 5181 "cmpl %[tbytes], %%ecx\n\t" \ 5182 "jne 73b\n\t" \ 5183 "jmp 72f\n\t" \ 5184 "\n" \ 5185 "71:\n\t" \ 5186 "vmovdqu %%xmm0, (%[tag])\n\t" \ 5187 "\n" \ 5188 "72:\n\t" 5189 5190 #define CMP_TAG_AVX() \ 5191 "cmpl $16, %[tbytes]\n\t" \ 5192 "je 71f\n\t" \ 5193 "subq $16, %%rsp\n\t" \ 5194 "xorq %%rcx, %%rcx\n\t" \ 5195 "xorq %%rax, %%rax\n\t" \ 5196 "vmovdqu %%xmm0, (%%rsp)\n\t" \ 5197 "\n" \ 5198 "73:\n\t" \ 5199 "movzbl (%%rsp,%%rcx,1), %%r13d\n\t" \ 5200 "xorb (%[tag],%%rcx,1), %%r13b\n\t" \ 5201 "orb %%r13b, %%al\n\t" \ 5202 "incl %%ecx\n\t" \ 5203 "cmpl %[tbytes], %%ecx\n\t" \ 5204 "jne 73b\n\t" \ 5205 "cmpb $0x00, %%al\n\t" \ 5206 "sete %%al\n\t" \ 5207 "addq $16, %%rsp\n\t" \ 5208 "jmp 72f\n\t" \ 5209 "\n" \ 5210 "71:\n\t" \ 5211 "vmovdqu (%[tag]), %%xmm1\n\t" \ 5212 "vpcmpeqb %%xmm1, %%xmm0, %%xmm0\n\t" \ 5213 "vpmovmskb %%xmm0, %%edx\n\t" \ 5214 "# %%edx == 0xFFFF then return 1 else => return 0\n\t" \ 5215 "xorl %%eax, %%eax\n\t" \ 5216 "cmpl $0xffff, %%edx\n\t" \ 5217 "sete %%al\n\t" \ 5218 "\n" \ 5219 "72:\n\t" \ 5220 "movl %%eax, (%[res])\n\t" 5221 5222 static void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out, 5223 const unsigned char* addt, 5224 const unsigned char* ivec, unsigned char *tag, 5225 unsigned int nbytes, unsigned int abytes, 5226 unsigned int ibytes, unsigned int tbytes, 5227 const unsigned char* key, int 
nr) 5228 { 5229 register const unsigned char* iv asm("rax") = ivec; 5230 register unsigned int ivLen asm("ebx") = ibytes; 5231 5232 __asm__ __volatile__ ( 5233 "subq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 5234 /* Counter is xmm13 */ 5235 "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" 5236 "vpxor " VAR(XR) ", " VAR(XR) ", " VAR(XR) "\n\t" 5237 "movl %[ibytes], %%edx\n\t" 5238 "cmpl $12, %%edx\n\t" 5239 "jne 35f\n\t" 5240 CALC_IV_12_AVX1() 5241 "\n" 5242 "35:\n\t" 5243 CALC_IV_AVX1() 5244 "\n" 5245 "39:\n\t" 5246 5247 CALC_AAD_AVX1() 5248 5249 "# Calculate counter and H\n\t" 5250 "vpsrlq $63, " VAR(HR) ", %%xmm5\n\t" 5251 "vpsllq $1, " VAR(HR) ", %%xmm4\n\t" 5252 "vpslldq $8, %%xmm5, %%xmm5\n\t" 5253 "vpor %%xmm5, %%xmm4, %%xmm4\n\t" 5254 "vpshufd $0xff, " VAR(HR) ", " VAR(HR) "\n\t" 5255 "vpsrad $31, " VAR(HR) ", " VAR(HR) "\n\t" 5256 "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" 5257 "vpand %[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t" 5258 "vpaddd %[ONE], %%xmm13, %%xmm13\n\t" 5259 "vpxor %%xmm4, " VAR(HR) ", " VAR(HR) "\n\t" 5260 "vmovdqu %%xmm13, " VAR(CTR1) "\n\t" 5261 5262 "xorl " VAR(KR) ", " VAR(KR) "\n\t" 5263 5264 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) 5265 "cmpl $128, %[nbytes]\n\t" 5266 "movl %[nbytes], %%r13d\n\t" 5267 "jl 5f\n\t" 5268 "andl $0xffffff80, %%r13d\n\t" 5269 5270 CALC_HT_8_AVX1() 5271 5272 "# First 128 bytes of input\n\t" 5273 VAESENC_128() 5274 5275 "cmpl $128, %%r13d\n\t" 5276 "movl $128, " VAR(KR) "\n\t" 5277 "jle 2f\n\t" 5278 5279 "# More 128 bytes of input\n\t" 5280 "\n" 5281 "3:\n\t" 5282 VAESENC_128_GHASH_AVX1(%%rdx, 0) 5283 "addl $128, " VAR(KR) "\n\t" 5284 "cmpl %%r13d, " VAR(KR) "\n\t" 5285 "jl 3b\n\t" 5286 "\n" 5287 "2:\n\t" 5288 "vmovdqa %[BSWAP_MASK], %%xmm13\n\t" 5289 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" 5290 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" 5291 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" 5292 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t" 5293 "vpxor %%xmm2, %%xmm4, %%xmm4\n\t" 5294 "vpshufb %%xmm13, %%xmm8, 
%%xmm8\n\t" 5295 "vpshufb %%xmm13, %%xmm9, %%xmm9\n\t" 5296 "vpshufb %%xmm13, %%xmm10, %%xmm10\n\t" 5297 "vpshufb %%xmm13, %%xmm11, %%xmm11\n\t" 5298 5299 "vmovdqu (" VAR(HTR) "), %%xmm12\n\t" 5300 "vmovdqu 16(" VAR(HTR) "), %%xmm14\n\t" 5301 GHASH_GFMUL_AVX1(XR, %%xmm13, %%xmm11, %%xmm12) 5302 GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm10, %%xmm14) 5303 "vmovdqu 32(" VAR(HTR) "), %%xmm12\n\t" 5304 "vmovdqu 48(" VAR(HTR) "), %%xmm14\n\t" 5305 GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm9, %%xmm12) 5306 GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm8, %%xmm14) 5307 "vmovdqu 64(" VAR(HTR) "), %%xmm12\n\t" 5308 "vmovdqu 80(" VAR(HTR) "), %%xmm14\n\t" 5309 GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm7, %%xmm12) 5310 GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm6, %%xmm14) 5311 "vmovdqu 96(" VAR(HTR) "), %%xmm12\n\t" 5312 "vmovdqu 112(" VAR(HTR) "), %%xmm14\n\t" 5313 GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm5, %%xmm12) 5314 GHASH_GFMUL_RED_XOR_AVX1(XR, %%xmm13, %%xmm4, %%xmm14) 5315 5316 "vmovdqu 0(" VAR(HTR) "), " VAR(HR) "\n\t" 5317 "\n" 5318 "5:\n\t" 5319 "movl %[nbytes], %%edx\n\t" 5320 "cmpl %%edx, " VAR(KR) "\n\t" 5321 "jge 55f\n\t" 5322 #endif 5323 5324 "movl %[nbytes], %%r13d\n\t" 5325 "andl $0xfffffff0, %%r13d\n\t" 5326 "cmpl %%r13d, " VAR(KR) "\n\t" 5327 "jge 14f\n\t" 5328 5329 VAESENC_BLOCK() 5330 "addl $16, " VAR(KR) "\n\t" 5331 "cmpl %%r13d, " VAR(KR) "\n\t" 5332 "jge 13f\n\t" 5333 "\n" 5334 "12:\n\t" 5335 "vmovdqu (%[in]," VAR(KR64) ",1), %%xmm9\n\t" 5336 VAESENC_GFMUL(%%xmm9, HR, XR) 5337 "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" 5338 "addl $16, " VAR(KR) "\n\t" 5339 "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t" 5340 "cmpl %%r13d, " VAR(KR) "\n\t" 5341 "jl 12b\n\t" 5342 "\n" 5343 "13:\n\t" 5344 GHASH_GFMUL_RED_AVX1(XR, HR, XR) 5345 "\n" 5346 "14:\n\t" 5347 5348 AESENC_LAST15_ENC_AVX1() 5349 "\n" 5350 "55:\n\t" 5351 5352 CALC_TAG_AVX1() 5353 STORE_TAG_AVX() 5354 "addq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 5355 "vzeroupper\n\t" 5356 5357 : 5358 : [KEY] "r" (key), 5359 [in] "r" (in), 
[out] "r" (out), [nr] "r" (nr), 5360 [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), 5361 [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tbytes), 5362 [tag] "r" (tag), 5363 [BSWAP_MASK] "m" (BSWAP_MASK), 5364 [BSWAP_EPI64] "m" (BSWAP_EPI64), 5365 [ONE] "m" (ONE), 5366 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) 5367 [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), 5368 [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), 5369 [EIGHT] "m" (EIGHT), 5370 #endif 5371 [MOD2_128] "m" (MOD2_128) 5372 : "xmm15", "xmm14", "xmm13", "xmm12", 5373 "xmm0", "xmm1", "xmm2", "xmm3", "memory", 5374 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 5375 "rcx", "rdx", "r13" 5376 ); 5377 } 5378 5379 #ifdef HAVE_INTEL_AVX2 5380 /* Encrypt and carry-less multiply for AVX2. */ 5381 #define VAESENC_PCLMUL_AVX2_1(src, o1, o2, o3) \ 5382 "vmovdqu " #o2 "(" #src "), %%xmm12\n\t" \ 5383 "vmovdqa " #o1 "(%[KEY]), %%xmm0\n\t" \ 5384 "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ 5385 "vmovdqu " #o3 "(" VAR(HTR) "), %%xmm13\n\t" \ 5386 "vpxor %%xmm2, %%xmm12, %%xmm12\n\t" \ 5387 "vpclmulqdq $0x10, %%xmm13, %%xmm12, %%xmm1\n\t" \ 5388 "vpclmulqdq $0x01, %%xmm13, %%xmm12, %%xmm14\n\t" \ 5389 "vpclmulqdq $0x00, %%xmm13, %%xmm12, %%xmm2\n\t" \ 5390 "vpclmulqdq $0x11, %%xmm13, %%xmm12, %%xmm3\n\t" \ 5391 "vaesenc %%xmm0, %%xmm4, %%xmm4\n\t" \ 5392 "vaesenc %%xmm0, %%xmm5, %%xmm5\n\t" \ 5393 "vaesenc %%xmm0, %%xmm6, %%xmm6\n\t" \ 5394 "vaesenc %%xmm0, %%xmm7, %%xmm7\n\t" \ 5395 "vaesenc %%xmm0, %%xmm8, %%xmm8\n\t" \ 5396 "vaesenc %%xmm0, %%xmm9, %%xmm9\n\t" \ 5397 "vaesenc %%xmm0, %%xmm10, %%xmm10\n\t" \ 5398 "vaesenc %%xmm0, %%xmm11, %%xmm11\n\t" \ 5399 5400 #define VAESENC_PCLMUL_AVX2_2(src, o1, o2, o3) \ 5401 "vmovdqu " #o2 "(" #src "), %%xmm12\n\t" \ 5402 "vmovdqu " #o3 "(" VAR(HTR) "), %%xmm0\n\t" \ 5403 "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ 5404 "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ 5405 "vpclmulqdq 
$0x10, %%xmm0, %%xmm12, %%xmm13\n\t" \ 5406 "vpclmulqdq $0x01, %%xmm0, %%xmm12, %%xmm14\n\t" \ 5407 "vpclmulqdq $0x00, %%xmm0, %%xmm12, %%xmm15\n\t" \ 5408 "vpclmulqdq $0x11, %%xmm0, %%xmm12, %%xmm12\n\t" \ 5409 "vmovdqa " #o1 "(%[KEY]), %%xmm0\n\t" \ 5410 "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ 5411 "vpxor %%xmm12, %%xmm3, %%xmm3\n\t" \ 5412 "vaesenc %%xmm0, %%xmm4, %%xmm4\n\t" \ 5413 "vaesenc %%xmm0, %%xmm5, %%xmm5\n\t" \ 5414 "vaesenc %%xmm0, %%xmm6, %%xmm6\n\t" \ 5415 "vaesenc %%xmm0, %%xmm7, %%xmm7\n\t" \ 5416 "vaesenc %%xmm0, %%xmm8, %%xmm8\n\t" \ 5417 "vaesenc %%xmm0, %%xmm9, %%xmm9\n\t" \ 5418 "vaesenc %%xmm0, %%xmm10, %%xmm10\n\t" \ 5419 "vaesenc %%xmm0, %%xmm11, %%xmm11\n\t" \ 5420 5421 #define VAESENC_PCLMUL_AVX2_N(src, o1, o2, o3) \ 5422 "vmovdqu " #o2 "(" #src "), %%xmm12\n\t" \ 5423 "vmovdqu " #o3 "(" VAR(HTR) "), %%xmm0\n\t" \ 5424 "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ 5425 "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ 5426 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ 5427 "vpclmulqdq $0x10, %%xmm0, %%xmm12, %%xmm13\n\t" \ 5428 "vpclmulqdq $0x01, %%xmm0, %%xmm12, %%xmm14\n\t" \ 5429 "vpclmulqdq $0x00, %%xmm0, %%xmm12, %%xmm15\n\t" \ 5430 "vpclmulqdq $0x11, %%xmm0, %%xmm12, %%xmm12\n\t" \ 5431 "vmovdqa " #o1 "(%[KEY]), %%xmm0\n\t" \ 5432 "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ 5433 "vpxor %%xmm12, %%xmm3, %%xmm3\n\t" \ 5434 "vaesenc %%xmm0, %%xmm4, %%xmm4\n\t" \ 5435 "vaesenc %%xmm0, %%xmm5, %%xmm5\n\t" \ 5436 "vaesenc %%xmm0, %%xmm6, %%xmm6\n\t" \ 5437 "vaesenc %%xmm0, %%xmm7, %%xmm7\n\t" \ 5438 "vaesenc %%xmm0, %%xmm8, %%xmm8\n\t" \ 5439 "vaesenc %%xmm0, %%xmm9, %%xmm9\n\t" \ 5440 "vaesenc %%xmm0, %%xmm10, %%xmm10\n\t" \ 5441 "vaesenc %%xmm0, %%xmm11, %%xmm11\n\t" \ 5442 5443 #define VAESENC_PCLMUL_AVX2_L(o) \ 5444 "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ 5445 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ 5446 "vpslldq $8, %%xmm1, %%xmm12\n\t" \ 5447 "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ 5448 "vmovdqa "#o"(%[KEY]), %%xmm15\n\t" \ 5449 "vmovdqa %[MOD2_128], %%xmm0\n\t" \ 
5450 "vaesenc %%xmm15, %%xmm4, %%xmm4\n\t" \ 5451 "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ 5452 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ 5453 "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm14\n\t" \ 5454 "vaesenc %%xmm15, %%xmm5, %%xmm5\n\t" \ 5455 "vaesenc %%xmm15, %%xmm6, %%xmm6\n\t" \ 5456 "vaesenc %%xmm15, %%xmm7, %%xmm7\n\t" \ 5457 "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \ 5458 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ 5459 "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm14\n\t" \ 5460 "vaesenc %%xmm15, %%xmm8, %%xmm8\n\t" \ 5461 "vaesenc %%xmm15, %%xmm9, %%xmm9\n\t" \ 5462 "vaesenc %%xmm15, %%xmm10, %%xmm10\n\t" \ 5463 "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \ 5464 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ 5465 "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \ 5466 "vaesenc %%xmm15, %%xmm11, %%xmm11\n\t" 5467 5468 #define VAESENC_BLOCK_AVX2() \ 5469 "vmovdqu " VAR(CTR1) ", %%xmm5\n\t" \ 5470 "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" \ 5471 "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" \ 5472 "vmovdqu %%xmm5, " VAR(CTR1) "\n\t" \ 5473 "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ 5474 "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5475 "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5476 "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5477 "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5478 "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5479 "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5480 "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5481 "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5482 "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5483 "cmpl $11, %[nr]\n\t" \ 5484 "vmovdqa 160(%[KEY]), %%xmm5\n\t" \ 5485 "jl %=f\n\t" \ 5486 "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ 5487 "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5488 "cmpl $13, %[nr]\n\t" \ 5489 "vmovdqa 192(%[KEY]), %%xmm5\n\t" \ 5490 "jl %=f\n\t" \ 5491 "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ 5492 "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5493 "vmovdqa 224(%[KEY]), %%xmm5\n\t" \ 5494 "%=:\n\t" \ 5495 "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \ 5496 "vmovdqu (%[in]," VAR(KR64) ",1), %%xmm5\n\t" \ 
5497 "vpxor %%xmm5, %%xmm4, %%xmm4\n\t" \ 5498 "vmovdqu %%xmm4, (%[out]," VAR(KR64) ",1)\n\t" \ 5499 "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ 5500 "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t" 5501 5502 /* Karatsuba multiplication - slower 5503 * H01 = H[1] ^ H[0] (top and bottom 64-bits XORed) 5504 */ 5505 #define _VAESENC_GFMUL_AVX2(in, H, X, ctr1, H01) \ 5506 "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ 5507 "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5508 "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5509 "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5510 "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5511 "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5512 "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5513 "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5514 "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5515 "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5516 "cmpl $11, %[nr]\n\t" \ 5517 "vmovdqa 160(%[KEY]), %%xmm5\n\t" \ 5518 "jl %=f\n\t" \ 5519 "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ 5520 "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5521 "cmpl $13, %[nr]\n\t" \ 5522 "vmovdqa 192(%[KEY]), %%xmm5\n\t" \ 5523 "jl %=f\n\t" \ 5524 "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ 5525 "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5526 "vmovdqa 224(%[KEY]), %%xmm5\n\t" \ 5527 "%=:\n\t" \ 5528 "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \ 5529 "vmovdqu " #in ", %%xmm0\n\t" \ 5530 "vpxor %%xmm0, %%xmm4, %%xmm4\n\t" \ 5531 \ 5532 "vpsrldq $8, " #X ", %%xmm2\n\t" \ 5533 "vpxor " #X ", %%xmm2, %%xmm2\n\t" \ 5534 "vpclmulqdq $0x00, " #H ", " #X ", %%xmm5\n\t" \ 5535 "vpclmulqdq $0x11, " #H ", " #X ", %%xmm8\n\t" \ 5536 "vpclmulqdq $0x00, "#H01", %%xmm2, %%xmm7\n\t" \ 5537 "vpxor %%xmm5, %%xmm7, %%xmm7\n\t" \ 5538 "vpxor %%xmm8, %%xmm7, %%xmm7\n\t" \ 5539 "vpslldq $8, %%xmm7, %%xmm6\n\t" \ 5540 "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ 5541 "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ 5542 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5543 \ 5544 "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ 5545 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" 
\ 5546 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5547 "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ 5548 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5549 "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ 5550 "vpxor %%xmm5, %%xmm6, " VAR(XR) "\n\t" 5551 #define VAESENC_GFMUL_AVX2(in, H, X, ctr1) \ 5552 _VAESENC_GFMUL_AVX2(in, H, X, ctr1) 5553 5554 #define _VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1) \ 5555 "vpclmulqdq $0x10, " #H ", " #X ", %%xmm7\n\t" \ 5556 "vpclmulqdq $0x01, " #H ", " #X ", %%xmm6\n\t" \ 5557 "vpclmulqdq $0x00, " #H ", " #X ", %%xmm5\n\t" \ 5558 "vpclmulqdq $0x11, " #H ", " #X ", %%xmm8\n\t" \ 5559 "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ 5560 "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5561 "vpxor %%xmm6, %%xmm7, %%xmm7\n\t" \ 5562 "vpslldq $8, %%xmm7, %%xmm6\n\t" \ 5563 "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ 5564 "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5565 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5566 "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ 5567 "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5568 "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5569 "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5570 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5571 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5572 "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ 5573 "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5574 "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5575 "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5576 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5577 "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5578 "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ 5579 "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ 5580 "cmpl $11, %[nr]\n\t" \ 5581 "vmovdqa 160(%[KEY]), %%xmm3\n\t" \ 5582 "jl %=f\n\t" \ 5583 "vaesenc %%xmm3, %%xmm4, %%xmm4\n\t" \ 5584 "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5585 "cmpl $13, %[nr]\n\t" \ 5586 "vmovdqa 192(%[KEY]), %%xmm3\n\t" \ 5587 "jl %=f\n\t" \ 5588 "vaesenc %%xmm3, %%xmm4, %%xmm4\n\t" \ 5589 "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ 5590 "vmovdqa 224(%[KEY]), %%xmm3\n\t" \ 5591 "%=:\n\t" \ 
5592 "vaesenclast %%xmm3, %%xmm4, %%xmm4\n\t" \ 5593 "vpxor %%xmm5, %%xmm6, " VAR(XR) "\n\t" \ 5594 "vmovdqu " #in ", %%xmm5\n\t" \ 5595 "vpxor %%xmm5, %%xmm4, %%xmm4\n\t" 5596 #define VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1) \ 5597 _VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1) 5598 5599 5600 #define _GHASH_GFMUL_AVX2(r, r2, a, b) \ 5601 "vpclmulqdq $0x10, "#a", "#b", %%xmm2\n\t" \ 5602 "vpclmulqdq $0x01, "#a", "#b", %%xmm1\n\t" \ 5603 "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \ 5604 "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \ 5605 "vpxor %%xmm1, %%xmm2, %%xmm2\n\t" \ 5606 "vpslldq $8, %%xmm2, %%xmm1\n\t" \ 5607 "vpsrldq $8, %%xmm2, %%xmm2\n\t" \ 5608 "vpxor %%xmm1, %%xmm0, "#r2"\n\t" \ 5609 "vpxor %%xmm2, %%xmm3, " #r "\n\t" 5610 #define GHASH_GFMUL_AVX2(r, r2, a, b) \ 5611 _GHASH_GFMUL_AVX2(r, r2, a, b) 5612 5613 #define GHASH_MID_AVX2(r, r2) \ 5614 "vpsrld $31, "#r2", %%xmm0\n\t" \ 5615 "vpsrld $31, " #r ", %%xmm1\n\t" \ 5616 "vpslld $1, "#r2", "#r2"\n\t" \ 5617 "vpslld $1, " #r ", " #r "\n\t" \ 5618 "vpsrldq $12, %%xmm0, %%xmm2\n\t" \ 5619 "vpslldq $4, %%xmm0, %%xmm0\n\t" \ 5620 "vpslldq $4, %%xmm1, %%xmm1\n\t" \ 5621 "vpor %%xmm2, " #r ", " #r "\n\t" \ 5622 "vpor %%xmm0, "#r2", "#r2"\n\t" \ 5623 "vpor %%xmm1, " #r ", " #r "\n\t" 5624 5625 #define _GHASH_GFMUL_RED_AVX2(r, a, b) \ 5626 "vpclmulqdq $0x10, "#a", "#b", %%xmm7\n\t" \ 5627 "vpclmulqdq $0x01, "#a", "#b", %%xmm6\n\t" \ 5628 "vpclmulqdq $0x00, "#a", "#b", %%xmm5\n\t" \ 5629 "vpxor %%xmm6, %%xmm7, %%xmm7\n\t" \ 5630 "vpslldq $8, %%xmm7, %%xmm6\n\t" \ 5631 "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ 5632 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5633 "vpclmulqdq $0x11, "#a", "#b", %%xmm8\n\t" \ 5634 "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ 5635 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5636 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5637 "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ 5638 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5639 "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ 5640 "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ 5641 
"vpxor %%xmm5, %%xmm6, " #r "\n\t" 5642 #define GHASH_GFMUL_RED_AVX2(r, a, b) \ 5643 _GHASH_GFMUL_RED_AVX2(r, a, b) 5644 5645 #define _GHASH_GFSQR_RED2_AVX2(r, a, mod128) \ 5646 "vpclmulqdq $0x00, "#a", "#a", %%xmm6\n\t" \ 5647 "vpclmulqdq $0x11, "#a", "#a", %%xmm8\n\t" \ 5648 "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ 5649 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5650 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5651 "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ 5652 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5653 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5654 "vpxor %%xmm6, %%xmm8, " #r "\n\t" 5655 #define GHASH_GFSQR_RED2_AVX2(r, a, mod128) \ 5656 _GHASH_GFSQR_RED2_AVX2(r, a, mod128) 5657 5658 #define _GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128) \ 5659 "vpclmulqdq $0x10, "#a", "#b", %%xmm7\n\t" \ 5660 "vpclmulqdq $0x01, "#a", "#b", %%xmm6\n\t" \ 5661 "vpclmulqdq $0x00, "#a", "#b", %%xmm5\n\t" \ 5662 "vpclmulqdq $0x11, "#a", "#b", %%xmm8\n\t" \ 5663 "vpclmulqdq $0x00, "#b", "#b", %%xmm9\n\t" \ 5664 "vpclmulqdq $0x11, "#b", "#b", %%xmm10\n\t" \ 5665 "vpxor %%xmm6, %%xmm7, %%xmm7\n\t" \ 5666 "vpslldq $8, %%xmm7, %%xmm6\n\t" \ 5667 "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ 5668 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5669 "vpclmulqdq $0x10, "#mod128", %%xmm9, %%xmm4\n\t" \ 5670 "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ 5671 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5672 "vpshufd $0x4e, %%xmm9, %%xmm9\n\t" \ 5673 "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ 5674 "vpxor %%xmm4, %%xmm9, %%xmm9\n\t" \ 5675 "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ 5676 "vpclmulqdq $0x10, "#mod128", %%xmm9, %%xmm4\n\t" \ 5677 "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ 5678 "vpshufd $0x4e, %%xmm9, %%xmm9\n\t" \ 5679 "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ 5680 "vpxor %%xmm4, %%xmm9, %%xmm9\n\t" \ 5681 "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ 5682 "vpxor %%xmm10, %%xmm9, "#rs"\n\t" \ 5683 "vpxor %%xmm5, %%xmm6, "#rm"\n\t" 5684 #define GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128) \ 5685 
_GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128) 5686 5687 #define CALC_HT_8_AVX2() \ 5688 "vmovdqa %[MOD2_128], %%xmm11\n\t" \ 5689 "vmovdqa " VAR(XR) ", %%xmm2\n\t" \ 5690 "# H ^ 1 and H ^ 2\n\t" \ 5691 GHASH_GFSQR_RED2_AVX2(%%xmm0, HR, %%xmm11) \ 5692 "vmovdqu " VAR(HR) ", 0(" VAR(HTR) ")\n\t" \ 5693 "vmovdqu %%xmm0 , 16(" VAR(HTR) ")\n\t" \ 5694 "# H ^ 3 and H ^ 4\n\t" \ 5695 GHASH_GFMUL_SQR_RED2_AVX2(%%xmm1, %%xmm3, HR, %%xmm0, %%xmm11) \ 5696 "vmovdqu %%xmm1 , 32(" VAR(HTR) ")\n\t" \ 5697 "vmovdqu %%xmm3 , 48(" VAR(HTR) ")\n\t" \ 5698 "# H ^ 5 and H ^ 6\n\t" \ 5699 GHASH_GFMUL_SQR_RED2_AVX2(%%xmm12, %%xmm0, %%xmm0, %%xmm1, %%xmm11) \ 5700 "vmovdqu %%xmm12, 64(" VAR(HTR) ")\n\t" \ 5701 "vmovdqu %%xmm0 , 80(" VAR(HTR) ")\n\t" \ 5702 "# H ^ 7 and H ^ 8\n\t" \ 5703 GHASH_GFMUL_SQR_RED2_AVX2(%%xmm12, %%xmm0, %%xmm1, %%xmm3, %%xmm11) \ 5704 "vmovdqu %%xmm12, 96(" VAR(HTR) ")\n\t" \ 5705 "vmovdqu %%xmm0 , 112(" VAR(HTR) ")\n\t" 5706 5707 #define _GHASH_RED_AVX2(r, r2) \ 5708 "vmovdqa %[MOD2_128], %%xmm2\n\t" \ 5709 "vpclmulqdq $0x10, %%xmm2, "#r2", %%xmm0\n\t" \ 5710 "vpshufd $0x4e, "#r2", %%xmm1\n\t" \ 5711 "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \ 5712 "vpclmulqdq $0x10, %%xmm2, %%xmm1, %%xmm0\n\t" \ 5713 "vpshufd $0x4e, %%xmm1, %%xmm1\n\t" \ 5714 "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \ 5715 "vpxor %%xmm1, " #r ", " #r "\n\t" 5716 #define GHASH_RED_AVX2(r, r2) \ 5717 _GHASH_RED_AVX2(r, r2) 5718 5719 #define GHASH_FULL_AVX2(r, r2, a, b) \ 5720 GHASH_GFMUL_AVX2(r, r2, a, b) \ 5721 GHASH_MID_AVX2(r, r2) \ 5722 GHASH_RED_AVX2(r, r2) 5723 5724 #define _GFMUL_3V_AVX2(r, r2, r3, a, b) \ 5725 "vpclmulqdq $0x10, "#a", "#b", "#r3"\n\t" \ 5726 "vpclmulqdq $0x01, "#a", "#b", %%xmm1\n\t" \ 5727 "vpclmulqdq $0x00, "#a", "#b", "#r2"\n\t" \ 5728 "vpclmulqdq $0x11, "#a", "#b", " #r "\n\t" \ 5729 "vpxor %%xmm1, "#r3", "#r3"\n\t" 5730 #define GFMUL_3V_AVX2(r, r2, r3, a, b) \ 5731 _GFMUL_3V_AVX2(r, r2, r3, a, b) 5732 5733 #define _GFMUL_XOR_3V_AVX2(r, r2, r3, a, b) \ 5734 "vpclmulqdq $0x10, 
"#a", "#b", %%xmm2\n\t" \ 5735 "vpclmulqdq $0x01, "#a", "#b", %%xmm1\n\t" \ 5736 "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \ 5737 "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \ 5738 "vpxor %%xmm1, %%xmm2, %%xmm2\n\t" \ 5739 "vpxor %%xmm3, " #r ", " #r "\n\t" \ 5740 "vpxor %%xmm2, "#r3", "#r3"\n\t" \ 5741 "vpxor %%xmm0, "#r2", "#r2"\n\t" 5742 #define GFMUL_XOR_3V_AVX2(r, r2, r3, a, b) \ 5743 _GFMUL_XOR_3V_AVX2(r, r2, r3, a, b) 5744 5745 #define GHASH_GFMUL_RED_8_AVX2() \ 5746 "vmovdqu (" VAR(HTR) "), %%xmm12\n\t" \ 5747 GFMUL_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm11, %%xmm12) \ 5748 "vmovdqu 16(" VAR(HTR) "), %%xmm12\n\t" \ 5749 GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm10, %%xmm12) \ 5750 "vmovdqu 32(" VAR(HTR) "), %%xmm11\n\t" \ 5751 "vmovdqu 48(" VAR(HTR) "), %%xmm12\n\t" \ 5752 GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm9, %%xmm11) \ 5753 GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm8, %%xmm12) \ 5754 "vmovdqu 64(" VAR(HTR) "), %%xmm11\n\t" \ 5755 "vmovdqu 80(" VAR(HTR) "), %%xmm12\n\t" \ 5756 GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm7, %%xmm11) \ 5757 GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm6, %%xmm12) \ 5758 "vmovdqu 96(" VAR(HTR) "), %%xmm11\n\t" \ 5759 "vmovdqu 112(" VAR(HTR) "), %%xmm12\n\t" \ 5760 GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm5, %%xmm11) \ 5761 GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm4, %%xmm12) \ 5762 "vpslldq $8, %%xmm14, %%xmm12\n\t" \ 5763 "vpsrldq $8, %%xmm14, %%xmm14\n\t" \ 5764 "vpxor %%xmm12, %%xmm13, %%xmm13\n\t" \ 5765 "vpxor %%xmm14, " VAR(XR) ", " VAR(XR) "\n\t" \ 5766 GHASH_RED_AVX2(XR, %%xmm13) 5767 5768 #define CALC_IV_12_AVX2() \ 5769 "# Calculate values when IV is 12 bytes\n\t" \ 5770 "# Set counter based on IV\n\t" \ 5771 "movl $0x01000000, %%ecx\n\t" \ 5772 "vpinsrq $0, 0(%%rax), %%xmm13, %%xmm13\n\t" \ 5773 "vpinsrd $2, 8(%%rax), %%xmm13, %%xmm13\n\t" \ 5774 "vpinsrd $3, %%ecx, %%xmm13, %%xmm13\n\t" \ 5775 "# H = Encrypt X(=0) and T = Encrypt counter\n\t" \ 5776 "vmovdqa 0(%[KEY]), " VAR(HR) "\n\t" \ 5777 
"vmovdqa 16(%[KEY]), %%xmm12\n\t" \ 5778 "vpxor " VAR(HR) ", %%xmm13, %%xmm1\n\t" \ 5779 "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \ 5780 "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ 5781 "vmovdqa 32(%[KEY]), %%xmm0\n\t" \ 5782 "vmovdqa 48(%[KEY]), %%xmm12\n\t" \ 5783 "vaesenc %%xmm0, " VAR(HR) ", " VAR(HR) "\n\t" \ 5784 "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ 5785 "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \ 5786 "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ 5787 "vmovdqa 64(%[KEY]), %%xmm0\n\t" \ 5788 "vmovdqa 80(%[KEY]), %%xmm12\n\t" \ 5789 "vaesenc %%xmm0, " VAR(HR) ", " VAR(HR) "\n\t" \ 5790 "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ 5791 "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \ 5792 "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ 5793 "vmovdqa 96(%[KEY]), %%xmm0\n\t" \ 5794 "vmovdqa 112(%[KEY]), %%xmm12\n\t" \ 5795 "vaesenc %%xmm0, " VAR(HR) ", " VAR(HR) "\n\t" \ 5796 "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ 5797 "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \ 5798 "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ 5799 "vmovdqa 128(%[KEY]), %%xmm0\n\t" \ 5800 "vmovdqa 144(%[KEY]), %%xmm12\n\t" \ 5801 "vaesenc %%xmm0, " VAR(HR) ", " VAR(HR) "\n\t" \ 5802 "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ 5803 "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \ 5804 "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ 5805 "cmpl $11, %[nr]\n\t" \ 5806 "vmovdqa 160(%[KEY]), %%xmm0\n\t" \ 5807 "jl 31f\n\t" \ 5808 "vmovdqa 176(%[KEY]), %%xmm12\n\t" \ 5809 "vaesenc %%xmm0, " VAR(HR) ", " VAR(HR) "\n\t" \ 5810 "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ 5811 "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \ 5812 "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ 5813 "cmpl $13, %[nr]\n\t" \ 5814 "vmovdqa 192(%[KEY]), %%xmm0\n\t" \ 5815 "jl 31f\n\t" \ 5816 "vmovdqa 208(%[KEY]), %%xmm12\n\t" \ 5817 "vaesenc %%xmm0, " VAR(HR) ", " VAR(HR) "\n\t" \ 5818 "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ 5819 "vaesenc %%xmm12, " VAR(HR) ", " VAR(HR) "\n\t" \ 5820 "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ 5821 "vmovdqu 224(%[KEY]), 
%%xmm0\n\t" \ 5822 "31:\n\t" \ 5823 "vaesenclast %%xmm0, " VAR(HR) ", " VAR(HR) "\n\t" \ 5824 "vaesenclast %%xmm0, %%xmm1, %%xmm1\n\t" \ 5825 "vpshufb %[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \ 5826 "vmovdqu %%xmm1, " VAR(TR) "\n\t" \ 5827 5828 #define CALC_IV_AVX2() \ 5829 "# Calculate values when IV is not 12 bytes\n\t" \ 5830 "# H = Encrypt X(=0)\n\t" \ 5831 "vmovdqa 0(%[KEY]), " VAR(HR) "\n\t" \ 5832 VAESENC_AVX(HR) \ 5833 "vpshufb %[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \ 5834 "# Calc counter\n\t" \ 5835 "# Initialization vector\n\t" \ 5836 "cmpl $0, %%edx\n\t" \ 5837 "movq $0, %%rcx\n\t" \ 5838 "je 45f\n\t" \ 5839 "cmpl $16, %%edx\n\t" \ 5840 "jl 44f\n\t" \ 5841 "andl $0xfffffff0, %%edx\n\t" \ 5842 "\n" \ 5843 "43:\n\t" \ 5844 "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ 5845 "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ 5846 "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \ 5847 GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR) \ 5848 "addl $16, %%ecx\n\t" \ 5849 "cmpl %%edx, %%ecx\n\t" \ 5850 "jl 43b\n\t" \ 5851 "movl %[ibytes], %%edx\n\t" \ 5852 "cmpl %%edx, %%ecx\n\t" \ 5853 "je 45f\n\t" \ 5854 "\n" \ 5855 "44:\n\t" \ 5856 "subq $16, %%rsp\n\t" \ 5857 "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \ 5858 "xorl %%ebx, %%ebx\n\t" \ 5859 "vmovdqu %%xmm4, (%%rsp)\n\t" \ 5860 "42:\n\t" \ 5861 "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ 5862 "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ 5863 "incl %%ecx\n\t" \ 5864 "incl %%ebx\n\t" \ 5865 "cmpl %%edx, %%ecx\n\t" \ 5866 "jl 42b\n\t" \ 5867 "vmovdqu (%%rsp), %%xmm4\n\t" \ 5868 "addq $16, %%rsp\n\t" \ 5869 "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ 5870 "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \ 5871 GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR) \ 5872 "\n" \ 5873 "45:\n\t" \ 5874 "# T = Encrypt counter\n\t" \ 5875 "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \ 5876 "shll $3, %%edx\n\t" \ 5877 "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \ 5878 "vpxor %%xmm0, %%xmm13, %%xmm13\n\t" \ 5879 GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR) \ 5880 "vpshufb %[BSWAP_MASK], 
%%xmm13, %%xmm13\n\t" \ 5881 "# Encrypt counter\n\t" \ 5882 "vmovdqa 0(%[KEY]), %%xmm4\n\t" \ 5883 "vpxor %%xmm13, %%xmm4, %%xmm4\n\t" \ 5884 VAESENC_AVX(%%xmm4) \ 5885 "vmovdqu %%xmm4, " VAR(TR) "\n\t" 5886 5887 #define CALC_AAD_AVX2() \ 5888 "# Additional authentication data\n\t" \ 5889 "movl %[abytes], %%edx\n\t" \ 5890 "cmpl $0, %%edx\n\t" \ 5891 "je 25f\n\t" \ 5892 "movq %[addt], %%rax\n\t" \ 5893 "xorl %%ecx, %%ecx\n\t" \ 5894 "cmpl $16, %%edx\n\t" \ 5895 "jl 24f\n\t" \ 5896 "andl $0xfffffff0, %%edx\n\t" \ 5897 "\n" \ 5898 "23:\n\t" \ 5899 "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ 5900 "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ 5901 "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t" \ 5902 GHASH_FULL_AVX2(XR, %%xmm12, XR, HR) \ 5903 "addl $16, %%ecx\n\t" \ 5904 "cmpl %%edx, %%ecx\n\t" \ 5905 "jl 23b\n\t" \ 5906 "movl %[abytes], %%edx\n\t" \ 5907 "cmpl %%edx, %%ecx\n\t" \ 5908 "je 25f\n\t" \ 5909 "\n" \ 5910 "24:\n\t" \ 5911 "subq $16, %%rsp\n\t" \ 5912 "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \ 5913 "xorl %%ebx, %%ebx\n\t" \ 5914 "vmovdqu %%xmm4, (%%rsp)\n\t" \ 5915 "22:\n\t" \ 5916 "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ 5917 "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ 5918 "incl %%ecx\n\t" \ 5919 "incl %%ebx\n\t" \ 5920 "cmpl %%edx, %%ecx\n\t" \ 5921 "jl 22b\n\t" \ 5922 "vmovdqu (%%rsp), %%xmm4\n\t" \ 5923 "addq $16, %%rsp\n\t" \ 5924 "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ 5925 "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t" \ 5926 GHASH_FULL_AVX2(XR, %%xmm12, XR, HR) \ 5927 "\n" \ 5928 "25:\n\t" 5929 5930 #define VAESENC_128_GHASH_AVX2(src, o) \ 5931 "leaq (%[in]," VAR(KR64) ",1), %%rcx\n\t" \ 5932 "leaq (%[out]," VAR(KR64) ",1), %%rdx\n\t" \ 5933 /* src is either %%rcx or %%rdx */ \ 5934 VAESENC_CTR() \ 5935 VAESENC_XOR() \ 5936 VAESENC_PCLMUL_AVX2_1(src, 16, (o-128), 112) \ 5937 VAESENC_PCLMUL_AVX2_2(src, 32, (o-112), 96) \ 5938 VAESENC_PCLMUL_AVX2_N(src, 48, (o- 96), 80) \ 5939 VAESENC_PCLMUL_AVX2_N(src, 64, (o- 80), 64) \ 5940 VAESENC_PCLMUL_AVX2_N(src, 80, (o- 64), 
48) \ 5941 VAESENC_PCLMUL_AVX2_N(src, 96, (o- 48), 32) \ 5942 VAESENC_PCLMUL_AVX2_N(src, 112, (o- 32), 16) \ 5943 VAESENC_PCLMUL_AVX2_N(src, 128, (o- 16), 0) \ 5944 VAESENC_PCLMUL_AVX2_L(144) \ 5945 "cmpl $11, %[nr]\n\t" \ 5946 "vmovdqa 160(%[KEY]), %%xmm12\n\t" \ 5947 "jl 4f\n\t" \ 5948 VAESENC() \ 5949 VAESENC_SET(176) \ 5950 "cmpl $13, %[nr]\n\t" \ 5951 "vmovdqa 192(%[KEY]), %%xmm12\n\t" \ 5952 "jl 4f\n\t" \ 5953 VAESENC() \ 5954 VAESENC_SET(208) \ 5955 "vmovdqa 224(%[KEY]), %%xmm12\n\t" \ 5956 "\n" \ 5957 "4:\n\t" \ 5958 VAESENC_LAST(%%rcx, %%rdx) 5959 5960 #define AESENC_LAST15_ENC_AVX2() \ 5961 "movl %[nbytes], %%ecx\n\t" \ 5962 "movl %%ecx, %%edx\n\t" \ 5963 "andl $0x0f, %%ecx\n\t" \ 5964 "jz 55f\n\t" \ 5965 "vmovdqu " VAR(CTR1) ", %%xmm13\n\t" \ 5966 "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \ 5967 "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \ 5968 VAESENC_AVX(%%xmm13) \ 5969 "subq $16, %%rsp\n\t" \ 5970 "xorl %%ecx, %%ecx\n\t" \ 5971 "vmovdqu %%xmm13, (%%rsp)\n\t" \ 5972 "\n" \ 5973 "51:\n\t" \ 5974 "movzbl (%[in]," VAR(KR64) ",1), %%r13d\n\t" \ 5975 "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ 5976 "movb %%r13b, (%[out]," VAR(KR64) ",1)\n\t" \ 5977 "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ 5978 "incl " VAR(KR) "\n\t" \ 5979 "incl %%ecx\n\t" \ 5980 "cmpl %%edx, " VAR(KR) "\n\t" \ 5981 "jl 51b\n\t" \ 5982 "xorq %%r13, %%r13\n\t" \ 5983 "cmpl $16, %%ecx\n\t" \ 5984 "je 53f\n\t" \ 5985 "\n" \ 5986 "52:\n\t" \ 5987 "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ 5988 "incl %%ecx\n\t" \ 5989 "cmpl $16, %%ecx\n\t" \ 5990 "jl 52b\n\t" \ 5991 "53:\n\t" \ 5992 "vmovdqu (%%rsp), %%xmm13\n\t" \ 5993 "addq $16, %%rsp\n\t" \ 5994 "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ 5995 "vpxor %%xmm13, " VAR(XR) ", " VAR(XR) "\n\t" \ 5996 GHASH_GFMUL_RED_AVX2(XR, HR, XR) \ 5997 5998 #define AESENC_LAST15_DEC_AVX2() \ 5999 "movl %[nbytes], %%ecx\n\t" \ 6000 "movl %%ecx, %%edx\n\t" \ 6001 "andl $0x0f, %%ecx\n\t" \ 6002 "jz 55f\n\t" \ 6003 "vmovdqu " VAR(CTR1) ", %%xmm13\n\t" \ 6004 "vpshufb 
%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \ 6005 "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \ 6006 VAESENC_AVX(%%xmm13) \ 6007 "subq $32, %%rsp\n\t" \ 6008 "xorl %%ecx, %%ecx\n\t" \ 6009 "vmovdqu %%xmm13, (%%rsp)\n\t" \ 6010 "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \ 6011 "vmovdqu %%xmm0, 16(%%rsp)\n\t" \ 6012 "\n" \ 6013 "51:\n\t" \ 6014 "movzbl (%[in]," VAR(KR64) ",1), %%r13d\n\t" \ 6015 "movb %%r13b, 16(%%rsp,%%rcx,1)\n\t" \ 6016 "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ 6017 "movb %%r13b, (%[out]," VAR(KR64) ",1)\n\t" \ 6018 "incl " VAR(KR) "\n\t" \ 6019 "incl %%ecx\n\t" \ 6020 "cmpl %%edx, " VAR(KR) "\n\t" \ 6021 "jl 51b\n\t" \ 6022 "53:\n\t" \ 6023 "vmovdqu 16(%%rsp), %%xmm13\n\t" \ 6024 "addq $32, %%rsp\n\t" \ 6025 "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ 6026 "vpxor %%xmm13, " VAR(XR) ", " VAR(XR) "\n\t" \ 6027 GHASH_GFMUL_RED_AVX2(XR, HR, XR) \ 6028 6029 #define CALC_TAG_AVX2() \ 6030 "movl %[nbytes], %%edx\n\t" \ 6031 "movl %[abytes], %%ecx\n\t" \ 6032 "shlq $3, %%rdx\n\t" \ 6033 "shlq $3, %%rcx\n\t" \ 6034 "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \ 6035 "vpinsrq $1, %%rcx, %%xmm0, %%xmm0\n\t" \ 6036 "vpxor %%xmm0, " VAR(XR) ", " VAR(XR) "\n\t" \ 6037 GHASH_GFMUL_RED_AVX2(XR, HR, XR) \ 6038 "vpshufb %[BSWAP_MASK], " VAR(XR) ", " VAR(XR) "\n\t" \ 6039 "vpxor " VAR(TR) ", " VAR(XR) ", %%xmm0\n\t" \ 6040 6041 6042 static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, 6043 const unsigned char* addt, 6044 const unsigned char* ivec, unsigned char *tag, 6045 unsigned int nbytes, unsigned int abytes, 6046 unsigned int ibytes, unsigned int tbytes, 6047 const unsigned char* key, int nr) 6048 { 6049 register const unsigned char* iv asm("rax") = ivec; 6050 register unsigned int ivLen asm("ebx") = ibytes; 6051 6052 __asm__ __volatile__ ( 6053 "subq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6054 /* Counter is xmm13 */ 6055 "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" 6056 "vpxor " VAR(XR) ", " VAR(XR) ", " VAR(XR) "\n\t" 6057 "movl %[ibytes], %%edx\n\t" 6058 "cmpl $12, 
%%edx\n\t" 6059 "jne 35f\n\t" 6060 CALC_IV_12_AVX2() 6061 "jmp 39f\n\t" 6062 "\n" 6063 "35:\n\t" 6064 CALC_IV_AVX2() 6065 "\n" 6066 "39:\n\t" 6067 6068 CALC_AAD_AVX2() 6069 6070 "# Calculate counter and H\n\t" 6071 "vpsrlq $63, " VAR(HR) ", %%xmm5\n\t" 6072 "vpsllq $1, " VAR(HR) ", %%xmm4\n\t" 6073 "vpslldq $8, %%xmm5, %%xmm5\n\t" 6074 "vpor %%xmm5, %%xmm4, %%xmm4\n\t" 6075 "vpshufd $0xff, " VAR(HR) ", " VAR(HR) "\n\t" 6076 "vpsrad $31, " VAR(HR) ", " VAR(HR) "\n\t" 6077 "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" 6078 "vpand %[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t" 6079 "vpaddd %[ONE], %%xmm13, %%xmm13\n\t" 6080 "vpxor %%xmm4, " VAR(HR) ", " VAR(HR) "\n\t" 6081 "vmovdqu %%xmm13, " VAR(CTR1) "\n\t" 6082 6083 "xorl " VAR(KR) ", " VAR(KR) "\n\t" 6084 6085 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) 6086 "cmpl $128, %[nbytes]\n\t" 6087 "movl %[nbytes], %%r13d\n\t" 6088 "jl 5f\n\t" 6089 "andl $0xffffff80, %%r13d\n\t" 6090 6091 CALC_HT_8_AVX2() 6092 6093 "# First 128 bytes of input\n\t" 6094 VAESENC_128() 6095 6096 "cmpl $128, %%r13d\n\t" 6097 "movl $128, " VAR(KR) "\n\t" 6098 "jle 2f\n\t" 6099 6100 "# More 128 bytes of input\n\t" 6101 "\n" 6102 "3:\n\t" 6103 VAESENC_128_GHASH_AVX2(%%rdx, 0) 6104 "addl $128, " VAR(KR) "\n\t" 6105 "cmpl %%r13d, " VAR(KR) "\n\t" 6106 "jl 3b\n\t" 6107 "\n" 6108 "2:\n\t" 6109 "vmovdqa %[BSWAP_MASK], %%xmm13\n\t" 6110 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" 6111 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" 6112 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" 6113 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t" 6114 "vpshufb %%xmm13, %%xmm8, %%xmm8\n\t" 6115 "vpshufb %%xmm13, %%xmm9, %%xmm9\n\t" 6116 "vpshufb %%xmm13, %%xmm10, %%xmm10\n\t" 6117 "vpshufb %%xmm13, %%xmm11, %%xmm11\n\t" 6118 "vpxor %%xmm2, %%xmm4, %%xmm4\n\t" 6119 6120 GHASH_GFMUL_RED_8_AVX2() 6121 6122 "vmovdqu 0(" VAR(HTR) "), " VAR(HR) "\n\t" 6123 "\n" 6124 "5:\n\t" 6125 "movl %[nbytes], %%edx\n\t" 6126 "cmpl %%edx, " VAR(KR) "\n\t" 6127 "jge 55f\n\t" 6128 #endif 6129 
6130 "movl %[nbytes], %%r13d\n\t" 6131 "andl $0xfffffff0, %%r13d\n\t" 6132 "cmpl %%r13d, " VAR(KR) "\n\t" 6133 "jge 14f\n\t" 6134 6135 VAESENC_BLOCK_AVX2() 6136 "addl $16, " VAR(KR) "\n\t" 6137 "cmpl %%r13d, " VAR(KR) "\n\t" 6138 "jge 13f\n\t" 6139 "vmovdqa %[MOD2_128], %%xmm0\n\t" 6140 "\n" 6141 "12:\n\t" 6142 "vmovdqu (%[in]," VAR(KR64) ",1), %%xmm9\n\t" 6143 "vmovdqu " VAR(CTR1) ", %%xmm5\n\t" 6144 "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" 6145 "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" 6146 "vmovdqu %%xmm5, " VAR(CTR1) "\n\t" 6147 VAESENC_GFMUL_SB_AVX2(%%xmm9, HR, XR, CTR1) 6148 "vmovdqu %%xmm4, (%[out]," VAR(KR64) ",1)\n\t" 6149 "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" 6150 "addl $16, " VAR(KR) "\n\t" 6151 "vpxor %%xmm4, " VAR(XR) ", " VAR(XR) "\n\t" 6152 "cmpl %%r13d, " VAR(KR) "\n\t" 6153 "jl 12b\n\t" 6154 "\n" 6155 "13:\n\t" 6156 GHASH_GFMUL_RED_AVX2(XR, HR, XR) 6157 "\n" 6158 "14:\n\t" 6159 6160 AESENC_LAST15_ENC_AVX2() 6161 "\n" 6162 "55:\n\t" 6163 6164 CALC_TAG_AVX2() 6165 STORE_TAG_AVX() 6166 "addq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6167 "vzeroupper\n\t" 6168 6169 : 6170 : [KEY] "r" (key), 6171 [in] "r" (in), [out] "r" (out), [nr] "r" (nr), 6172 [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), 6173 [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tbytes), 6174 [tag] "r" (tag), 6175 [BSWAP_MASK] "m" (BSWAP_MASK), 6176 [BSWAP_EPI64] "m" (BSWAP_EPI64), 6177 [ONE] "m" (ONE), 6178 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) 6179 [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), 6180 [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), 6181 [EIGHT] "m" (EIGHT), 6182 #endif 6183 [MOD2_128] "m" (MOD2_128) 6184 : "xmm15", "xmm14", "xmm13", "xmm12", 6185 "xmm0", "xmm1", "xmm2", "xmm3", "memory", 6186 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 6187 "rcx", "rdx", "r13" 6188 ); 6189 } 6190 #endif /* HAVE_INTEL_AVX2 */ 6191 #endif /* HAVE_INTEL_AVX1 */ 6192 6193 #ifdef HAVE_AES_DECRYPT 
6194 /* Figure 10. AES-GCM – Decrypt With Single Block Ghash at a Time */ 6195 6196 static void AES_GCM_decrypt(const unsigned char *in, unsigned char *out, 6197 const unsigned char* addt, 6198 const unsigned char* ivec, const unsigned char *tag, 6199 int nbytes, int abytes, int ibytes, int tbytes, 6200 const unsigned char* key, int nr, int* res) 6201 { 6202 register const unsigned char* iv asm("rax") = ivec; 6203 register int ivLen asm("ebx") = ibytes; 6204 register int tagLen asm("edx") = tbytes; 6205 6206 __asm__ __volatile__ ( 6207 "pushq %%rdx\n\t" 6208 "subq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6209 /* Counter is xmm13 */ 6210 "pxor %%xmm13, %%xmm13\n\t" 6211 "pxor %%xmm15, %%xmm15\n\t" 6212 "movl %[ibytes], %%edx\n\t" 6213 "cmpl $12, %%edx\n\t" 6214 "jne 35f\n\t" 6215 CALC_IV_12() 6216 "\n" 6217 "35:\n\t" 6218 CALC_IV() 6219 "\n" 6220 "39:\n\t" 6221 6222 CALC_AAD() 6223 6224 "# Calculate counter and H\n\t" 6225 "pshufb %[BSWAP_EPI64], %%xmm13\n\t" 6226 "movdqa " VAR(HR) ", %%xmm5\n\t" 6227 "paddd %[ONE], %%xmm13\n\t" 6228 "movdqa " VAR(HR) ", %%xmm4\n\t" 6229 "movdqu %%xmm13, " VAR(CTR1) "\n\t" 6230 "psrlq $63, %%xmm5\n\t" 6231 "psllq $1, %%xmm4\n\t" 6232 "pslldq $8, %%xmm5\n\t" 6233 "por %%xmm5, %%xmm4\n\t" 6234 "pshufd $0xff, " VAR(HR) ", " VAR(HR) "\n\t" 6235 "psrad $31, " VAR(HR) "\n\t" 6236 "pand %[MOD2_128], " VAR(HR) "\n\t" 6237 "pxor %%xmm4, " VAR(HR) "\n\t" 6238 6239 "xorl " VAR(KR) ", " VAR(KR) "\n\t" 6240 6241 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) 6242 "cmpl $128, %[nbytes]\n\t" 6243 "jl 5f\n\t" 6244 6245 CALC_HT_8_AVX() 6246 6247 "movl %[nbytes], %%r13d\n\t" 6248 "andl $0xffffff80, %%r13d\n\t" 6249 "\n" 6250 "2:\n\t" 6251 AESENC_128_GHASH_AVX(%%rcx, 128) 6252 "addl $128, " VAR(KR) "\n\t" 6253 "cmpl %%r13d, " VAR(KR) "\n\t" 6254 "jl 2b\n\t" 6255 6256 "movdqa %%xmm2, " VAR(XR) "\n\t" 6257 "movdqu (%%rsp), " VAR(HR) "\n\t" 6258 "5:\n\t" 6259 "movl %[nbytes], %%edx\n\t" 6260 "cmpl %%edx, " VAR(KR) "\n\t" 6261 "jge 
55f\n\t" 6262 #endif 6263 "movl %[nbytes], %%r13d\n\t" 6264 "andl $0xfffffff0, %%r13d\n\t" 6265 "cmpl %%r13d, " VAR(KR) "\n\t" 6266 "jge 13f\n\t" 6267 6268 "\n" 6269 "12:\n\t" 6270 "leaq (%[in]," VAR(KR64) ",1), %%rcx\n\t" 6271 "leaq (%[out]," VAR(KR64) ",1), %%rdx\n\t" 6272 "movdqu (%%rcx), %%xmm1\n\t" 6273 "movdqa " VAR(HR) ", %%xmm0\n\t" 6274 "pshufb %[BSWAP_MASK], %%xmm1\n\t" 6275 "pxor " VAR(XR) ", %%xmm1\n\t" 6276 AESENC_GFMUL(%%rcx, %%rdx, %%xmm0, %%xmm1) 6277 "addl $16, " VAR(KR) "\n\t" 6278 "cmpl %%r13d, " VAR(KR) "\n\t" 6279 "jl 12b\n\t" 6280 "\n" 6281 "13:\n\t" 6282 6283 AESENC_LAST15_DEC_AVX() 6284 "\n" 6285 "55:\n\t" 6286 6287 CALC_TAG() 6288 "addq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6289 "popq %%rdx\n\t" 6290 CMP_TAG() 6291 6292 : 6293 : [KEY] "r" (key), 6294 [in] "r" (in), [out] "r" (out), [nr] "r" (nr), 6295 [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), 6296 [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tagLen), 6297 [tag] "r" (tag), [res] "r" (res), 6298 [BSWAP_MASK] "m" (BSWAP_MASK), 6299 [BSWAP_EPI64] "m" (BSWAP_EPI64), 6300 [ONE] "m" (ONE), 6301 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) 6302 [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), 6303 [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), 6304 [EIGHT] "m" (EIGHT), 6305 #endif 6306 [MOD2_128] "m" (MOD2_128) 6307 : "xmm15", "xmm14", "xmm13", "xmm12", 6308 "xmm0", "xmm1", "xmm2", "xmm3", "memory", 6309 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 6310 "rcx", "r13" 6311 ); 6312 } 6313 6314 #ifdef HAVE_INTEL_AVX1 6315 static void AES_GCM_decrypt_avx1(const unsigned char *in, unsigned char *out, 6316 const unsigned char* addt, 6317 const unsigned char* ivec, 6318 const unsigned char *tag, int nbytes, 6319 int abytes, int ibytes, int tbytes, 6320 const unsigned char* key, int nr, int* res) 6321 { 6322 register const unsigned char* iv asm("rax") = ivec; 6323 register int ivLen asm("ebx") = ibytes; 6324 
register int tagLen asm("edx") = tbytes; 6325 6326 __asm__ __volatile__ ( 6327 "pushq %%rdx\n\t" 6328 "subq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6329 /* Counter is xmm13 */ 6330 "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" 6331 "vpxor %%xmm15, %%xmm15, %%xmm15\n\t" 6332 "movl %[ibytes], %%edx\n\t" 6333 "cmpl $12, %%edx\n\t" 6334 "jne 35f\n\t" 6335 CALC_IV_12_AVX1() 6336 "\n" 6337 "35:\n\t" 6338 CALC_IV_AVX1() 6339 "\n" 6340 "39:\n\t" 6341 6342 CALC_AAD_AVX1() 6343 6344 "# Calculate counter and H\n\t" 6345 "vpsrlq $63, " VAR(HR) ", %%xmm5\n\t" 6346 "vpsllq $1, " VAR(HR) ", %%xmm4\n\t" 6347 "vpslldq $8, %%xmm5, %%xmm5\n\t" 6348 "vpor %%xmm5, %%xmm4, %%xmm4\n\t" 6349 "vpshufd $0xff, " VAR(HR) ", " VAR(HR) "\n\t" 6350 "vpsrad $31, " VAR(HR) ", " VAR(HR) "\n\t" 6351 "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" 6352 "vpand %[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t" 6353 "vpaddd %[ONE], %%xmm13, %%xmm13\n\t" 6354 "vpxor %%xmm4, " VAR(HR) ", " VAR(HR) "\n\t" 6355 "vmovdqu %%xmm13, " VAR(CTR1) "\n\t" 6356 6357 "xorl " VAR(KR) ", " VAR(KR) "\n\t" 6358 6359 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) 6360 "cmpl $128, %[nbytes]\n\t" 6361 "jl 5f\n\t" 6362 6363 CALC_HT_8_AVX1() 6364 6365 "movl %[nbytes], %%r13d\n\t" 6366 "andl $0xffffff80, %%r13d\n\t" 6367 "\n" 6368 "2:\n\t" 6369 VAESENC_128_GHASH_AVX1(%%rcx, 128) 6370 "addl $128, " VAR(KR) "\n\t" 6371 "cmpl %%r13d, " VAR(KR) "\n\t" 6372 "jl 2b\n\t" 6373 6374 "vmovdqa %%xmm2, " VAR(XR) "\n\t" 6375 "vmovdqu (%%rsp), " VAR(HR) "\n\t" 6376 "5:\n\t" 6377 "movl %[nbytes], %%edx\n\t" 6378 "cmpl %%edx, " VAR(KR) "\n\t" 6379 "jge 55f\n\t" 6380 #endif 6381 "movl %[nbytes], %%r13d\n\t" 6382 "andl $0xfffffff0, %%r13d\n\t" 6383 "cmpl %%r13d, " VAR(KR) "\n\t" 6384 "jge 13f\n\t" 6385 6386 "\n" 6387 "12:\n\t" 6388 "vmovdqu (%[in]," VAR(KR64) ",1), %%xmm9\n\t" 6389 "vmovdqa " VAR(HR) ", %%xmm0\n\t" 6390 "vpshufb %[BSWAP_MASK], %%xmm9, %%xmm1\n\t" 6391 "vpxor " VAR(XR) ", %%xmm1, %%xmm1\n\t" 6392 VAESENC_GFMUL(%%xmm9, 
%%xmm0, %%xmm1) 6393 "addl $16, " VAR(KR) "\n\t" 6394 "cmpl %%r13d, " VAR(KR) "\n\t" 6395 "jl 12b\n\t" 6396 "\n" 6397 "13:\n\t" 6398 6399 AESENC_LAST15_DEC_AVX1() 6400 "\n" 6401 "55:\n\t" 6402 6403 CALC_TAG_AVX1() 6404 "addq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6405 "popq %%rdx\n\t" 6406 CMP_TAG_AVX() 6407 "vzeroupper\n\t" 6408 6409 : 6410 : [KEY] "r" (key), 6411 [in] "r" (in), [out] "r" (out), [nr] "r" (nr), 6412 [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), 6413 [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tagLen), 6414 [tag] "r" (tag), [res] "r" (res), 6415 [BSWAP_MASK] "m" (BSWAP_MASK), 6416 [BSWAP_EPI64] "m" (BSWAP_EPI64), 6417 [ONE] "m" (ONE), 6418 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) 6419 [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), 6420 [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), 6421 [EIGHT] "m" (EIGHT), 6422 #endif 6423 [MOD2_128] "m" (MOD2_128) 6424 : "xmm15", "xmm14", "xmm13", "xmm12", 6425 "xmm0", "xmm1", "xmm2", "xmm3", "memory", 6426 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 6427 "rcx", "r13" 6428 ); 6429 } 6430 6431 #ifdef HAVE_INTEL_AVX2 6432 static void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, 6433 const unsigned char* addt, 6434 const unsigned char* ivec, 6435 const unsigned char *tag, int nbytes, 6436 int abytes, int ibytes, int tbytes, 6437 const unsigned char* key, int nr, int* res) 6438 { 6439 register const unsigned char* iv asm("rax") = ivec; 6440 register int ivLen asm("ebx") = ibytes; 6441 register int tagLen asm("edx") = tbytes; 6442 6443 __asm__ __volatile__ ( 6444 "pushq %%rdx\n\t" 6445 "subq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6446 /* Counter is xmm13 */ 6447 "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" 6448 "vpxor %%xmm15, %%xmm15, %%xmm15\n\t" 6449 "movl %[ibytes], %%edx\n\t" 6450 "cmpl $12, %%edx\n\t" 6451 "jne 35f\n\t" 6452 CALC_IV_12_AVX2() 6453 "jmp 39f\n\t" 6454 "\n" 6455 "35:\n\t" 6456 CALC_IV_AVX2() 
6457 "\n" 6458 "39:\n\t" 6459 6460 CALC_AAD_AVX2() 6461 6462 "# Calculate counter and H\n\t" 6463 "vpsrlq $63, " VAR(HR) ", %%xmm5\n\t" 6464 "vpsllq $1, " VAR(HR) ", %%xmm4\n\t" 6465 "vpslldq $8, %%xmm5, %%xmm5\n\t" 6466 "vpor %%xmm5, %%xmm4, %%xmm4\n\t" 6467 "vpshufd $0xff, " VAR(HR) ", " VAR(HR) "\n\t" 6468 "vpsrad $31, " VAR(HR) ", " VAR(HR) "\n\t" 6469 "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" 6470 "vpand %[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t" 6471 "vpaddd %[ONE], %%xmm13, %%xmm13\n\t" 6472 "vpxor %%xmm4, " VAR(HR) ", " VAR(HR) "\n\t" 6473 "vmovdqu %%xmm13, " VAR(CTR1) "\n\t" 6474 6475 "xorl " VAR(KR) ", " VAR(KR) "\n\t" 6476 6477 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) 6478 "cmpl $128, %[nbytes]\n\t" 6479 "jl 5f\n\t" 6480 6481 CALC_HT_8_AVX2() 6482 6483 "movl %[nbytes], %%r13d\n\t" 6484 "andl $0xffffff80, %%r13d\n\t" 6485 "\n" 6486 "2:\n\t" 6487 VAESENC_128_GHASH_AVX2(%%rcx, 128) 6488 "addl $128, " VAR(KR) "\n\t" 6489 "cmpl %%r13d, " VAR(KR) "\n\t" 6490 "jl 2b\n\t" 6491 6492 "vmovdqa %%xmm2, " VAR(XR) "\n\t" 6493 "vmovdqu (%%rsp), " VAR(HR) "\n\t" 6494 "5:\n\t" 6495 "movl %[nbytes], %%edx\n\t" 6496 "cmpl %%edx, " VAR(KR) "\n\t" 6497 "jge 55f\n\t" 6498 #endif 6499 "movl %[nbytes], %%r13d\n\t" 6500 "andl $0xfffffff0, %%r13d\n\t" 6501 "cmpl %%r13d, " VAR(KR) "\n\t" 6502 "jge 13f\n\t" 6503 6504 "vmovdqa %[MOD2_128], %%xmm0\n\t" 6505 "\n" 6506 "12:\n\t" 6507 "vmovdqu (%[in]," VAR(KR64) ",1), %%xmm9\n\t" 6508 "vmovdqu " VAR(CTR1) ", %%xmm5\n\t" 6509 "vpshufb %[BSWAP_MASK], %%xmm9, %%xmm1\n\t" 6510 "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" 6511 "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" 6512 "vpxor " VAR(XR) ", %%xmm1, %%xmm1\n\t" 6513 "vmovdqu %%xmm5, " VAR(CTR1) "\n\t" 6514 VAESENC_GFMUL_SB_AVX2(%%xmm9, HR, %%xmm1, CTR1) 6515 "vmovdqu %%xmm4, (%[out]," VAR(KR64) ",1)\n\t" 6516 "addl $16, " VAR(KR) "\n\t" 6517 "cmpl %%r13d, " VAR(KR) "\n\t" 6518 "jl 12b\n\t" 6519 "\n" 6520 "13:\n\t" 6521 6522 AESENC_LAST15_DEC_AVX2() 6523 "\n" 
6524 "55:\n\t" 6525 6526 CALC_TAG_AVX2() 6527 "addq $" VAR(STACK_OFFSET) ", %%rsp\n\t" 6528 "popq %%rdx\n\t" 6529 CMP_TAG_AVX() 6530 "vzeroupper\n\t" 6531 6532 : 6533 : [KEY] "r" (key), 6534 [in] "r" (in), [out] "r" (out), [nr] "r" (nr), 6535 [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), 6536 [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tagLen), 6537 [tag] "r" (tag), [res] "r" (res), 6538 [BSWAP_MASK] "m" (BSWAP_MASK), 6539 [BSWAP_EPI64] "m" (BSWAP_EPI64), 6540 [ONE] "m" (ONE), 6541 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) 6542 [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), 6543 [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), 6544 [EIGHT] "m" (EIGHT), 6545 #endif 6546 [MOD2_128] "m" (MOD2_128) 6547 : "xmm15", "xmm14", "xmm13", "xmm12", 6548 "xmm0", "xmm1", "xmm2", "xmm3", "memory", 6549 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 6550 "rcx", "r13" 6551 ); 6552 } 6553 #endif /* HAVE_INTEL_AVX2 */ 6554 #endif /* HAVE_INTEL_AVX1 */ 6555 #endif /* HAVE_AES_DECRYPT */ 6556 6557 #else /* _MSC_VER */ 6558 /* The following are for MSC based builds which do not allow 6559 * inline assembly. Intrinsic functions are used instead. 
*/ 6560 6561 #define aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T) \ 6562 do \ 6563 { \ 6564 word32 iv12[4]; \ 6565 iv12[0] = *(word32*)&ivec[0]; \ 6566 iv12[1] = *(word32*)&ivec[4]; \ 6567 iv12[2] = *(word32*)&ivec[8]; \ 6568 iv12[3] = 0x01000000; \ 6569 Y = _mm_loadu_si128((__m128i*)iv12); \ 6570 \ 6571 /* (Compute E[ZERO, KS] and E[Y0, KS] together */ \ 6572 tmp1 = _mm_load_si128(&KEY[0]); \ 6573 tmp2 = _mm_xor_si128(Y, KEY[0]); \ 6574 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ 6575 tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); \ 6576 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ 6577 tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); \ 6578 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ 6579 tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); \ 6580 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ 6581 tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); \ 6582 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ 6583 tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); \ 6584 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ 6585 tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); \ 6586 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ 6587 tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); \ 6588 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ 6589 tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); \ 6590 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ 6591 tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); \ 6592 lastKey = KEY[10]; \ 6593 if (nr > 10) { \ 6594 tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ 6595 tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ 6596 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ 6597 tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); \ 6598 lastKey = KEY[12]; \ 6599 if (nr > 12) { \ 6600 tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ 6601 tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ 6602 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ 6603 tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); \ 6604 lastKey = KEY[14]; \ 6605 } \ 6606 } \ 6607 H = _mm_aesenclast_si128(tmp1, lastKey); \ 6608 T = _mm_aesenclast_si128(tmp2, lastKey); \ 6609 H = _mm_shuffle_epi8(H, BSWAP_MASK); \ 6610 } \ 6611 while (0) 6612 6613 #define aes_gcm_calc_iv(KEY, ivec, ibytes, nr, 
H, Y, T) \ 6614 do \ 6615 { \ 6616 if (ibytes % 16) { \ 6617 i = ibytes / 16; \ 6618 for (j=0; j < (int)(ibytes%16); j++) \ 6619 ((unsigned char*)&last_block)[j] = ivec[i*16+j]; \ 6620 } \ 6621 tmp1 = _mm_load_si128(&KEY[0]); \ 6622 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ 6623 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ 6624 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ 6625 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ 6626 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ 6627 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ 6628 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ 6629 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ 6630 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ 6631 lastKey = KEY[10]; \ 6632 if (nr > 10) { \ 6633 tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ 6634 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ 6635 lastKey = KEY[12]; \ 6636 if (nr > 12) { \ 6637 tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ 6638 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ 6639 lastKey = KEY[14]; \ 6640 } \ 6641 } \ 6642 H = _mm_aesenclast_si128(tmp1, lastKey); \ 6643 H = _mm_shuffle_epi8(H, BSWAP_MASK); \ 6644 Y = _mm_setzero_si128(); \ 6645 for (i=0; i < (int)(ibytes/16); i++) { \ 6646 tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); \ 6647 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); \ 6648 Y = _mm_xor_si128(Y, tmp1); \ 6649 Y = gfmul_sw(Y, H); \ 6650 } \ 6651 if (ibytes % 16) { \ 6652 tmp1 = last_block; \ 6653 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); \ 6654 Y = _mm_xor_si128(Y, tmp1); \ 6655 Y = gfmul_sw(Y, H); \ 6656 } \ 6657 tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); \ 6658 tmp1 = _mm_insert_epi64(tmp1, 0, 1); \ 6659 Y = _mm_xor_si128(Y, tmp1); \ 6660 Y = gfmul_sw(Y, H); \ 6661 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ \ 6662 tmp1 = _mm_xor_si128(Y, KEY[0]); \ 6663 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ 6664 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ 6665 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ 6666 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ 6667 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ 6668 tmp1 = 
_mm_aesenc_si128(tmp1, KEY[6]); \ 6669 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ 6670 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ 6671 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ 6672 lastKey = KEY[10]; \ 6673 if (nr > 10) { \ 6674 tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ 6675 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ 6676 lastKey = KEY[12]; \ 6677 if (nr > 12) { \ 6678 tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ 6679 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ 6680 lastKey = KEY[14]; \ 6681 } \ 6682 } \ 6683 T = _mm_aesenclast_si128(tmp1, lastKey); \ 6684 } \ 6685 while (0) 6686 6687 #define AES_ENC_8(j) \ 6688 tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); \ 6689 tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); \ 6690 tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); \ 6691 tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); \ 6692 tmp5 = _mm_aesenc_si128(tmp5, KEY[j]); \ 6693 tmp6 = _mm_aesenc_si128(tmp6, KEY[j]); \ 6694 tmp7 = _mm_aesenc_si128(tmp7, KEY[j]); \ 6695 tmp8 = _mm_aesenc_si128(tmp8, KEY[j]); 6696 6697 #define AES_ENC_LAST_8() \ 6698 tmp1 =_mm_aesenclast_si128(tmp1, lastKey); \ 6699 tmp2 =_mm_aesenclast_si128(tmp2, lastKey); \ 6700 tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); \ 6701 tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); \ 6702 _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); \ 6703 _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); \ 6704 tmp3 =_mm_aesenclast_si128(tmp3, lastKey); \ 6705 tmp4 =_mm_aesenclast_si128(tmp4, lastKey); \ 6706 tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); \ 6707 tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); \ 6708 _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); \ 6709 _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); \ 6710 tmp5 =_mm_aesenclast_si128(tmp5, lastKey); \ 6711 tmp6 =_mm_aesenclast_si128(tmp6, lastKey); \ 6712 tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); \ 6713 tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); \ 6714 
_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); \ 6715 _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); \ 6716 tmp7 =_mm_aesenclast_si128(tmp7, lastKey); \ 6717 tmp8 =_mm_aesenclast_si128(tmp8, lastKey); \ 6718 tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); \ 6719 tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); \ 6720 _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); \ 6721 _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); 6722 3419 6723 3420 6724 static __m128i gfmul_sw(__m128i a, __m128i b) 3421 6725 { 3422 6726 __m128i r, t1, t2, t3, t4, t5, t6, t7; 3423 #ifndef WOLFSSL_AES_GCM_SLOW_CLMUL3424 /* 128 x 128 Carryless Multiply */3425 t3 = _mm_clmulepi64_si128(a, b, 0x10);3426 t2 = _mm_clmulepi64_si128(a, b, 0x01);3427 t1 = _mm_clmulepi64_si128(a, b, 0x00);3428 t4 = _mm_clmulepi64_si128(a, b, 0x11);3429 t3 = _mm_xor_si128(t3, t2);3430 t2 = _mm_slli_si128(t3, 8);3431 t3 = _mm_srli_si128(t3, 8);3432 t1 = _mm_xor_si128(t1, t2);3433 t4 = _mm_xor_si128(t4, t3);3434 3435 /* shift left 1 bit - bits reversed */3436 t5 = _mm_srli_epi32(t1, 31);3437 t6 = _mm_srli_epi32(t4, 31);3438 t1 = _mm_slli_epi32(t1, 1);3439 t4 = _mm_slli_epi32(t4, 1);3440 t7 = _mm_srli_si128(t5, 12);3441 t5 = _mm_slli_si128(t5, 4);3442 t6 = _mm_slli_si128(t6, 4);3443 t4 = _mm_or_si128(t4, t7);3444 t1 = _mm_or_si128(t1, t5);3445 t4 = _mm_or_si128(t4, t6);3446 3447 /* Reduction */3448 t2 = _mm_clmulepi64_si128(t1, MOD2_128, 0x10);3449 t3 = _mm_shuffle_epi32(t1, 78);3450 t3 = _mm_xor_si128(t3, t2);3451 t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10);3452 t3 = _mm_shuffle_epi32(t3, 78);3453 t3 = _mm_xor_si128(t3, t2);3454 r = _mm_xor_si128(t4, t3);3455 #else3456 6727 t2 = _mm_shuffle_epi32(b, 78); 3457 6728 t3 = _mm_shuffle_epi32(a, 78); … … 3497 6768 t7 = _mm_xor_si128(t7, t1); 3498 6769 r = _mm_xor_si128(t4, t7); 3499 #endif3500 6770 3501 6771 return r; 3502 }6772 } 3503 6773 3504 6774 static void gfmul_only(__m128i a, __m128i b, __m128i* r0, __m128i* r1) … … 3507 
6777 3508 6778 /* 128 x 128 Carryless Multiply */ 3509 #ifndef WOLFSSL_AES_GCM_SLOW_CLMUL3510 t3 = _mm_clmulepi64_si128(a, b, 0x10);3511 t2 = _mm_clmulepi64_si128(a, b, 0x01);3512 t1 = _mm_clmulepi64_si128(a, b, 0x00);3513 t4 = _mm_clmulepi64_si128(a, b, 0x11);3514 t3 = _mm_xor_si128(t3, t2);3515 t2 = _mm_slli_si128(t3, 8);3516 t3 = _mm_srli_si128(t3, 8);3517 t1 = _mm_xor_si128(t1, t2);3518 t4 = _mm_xor_si128(t4, t3);3519 #else3520 6779 t2 = _mm_shuffle_epi32(b, 78); 3521 6780 t3 = _mm_shuffle_epi32(a, 78); … … 3531 6790 t1 = _mm_xor_si128(t1, t3); 3532 6791 t4 = _mm_xor_si128(t4, t2); 3533 #endif3534 6792 *r0 = _mm_xor_si128(t1, *r0); 3535 6793 *r1 = _mm_xor_si128(t4, *r1); 3536 }6794 } 3537 6795 3538 6796 static __m128i gfmul_shl1(__m128i a) … … 3549 6807 t1 = _mm_xor_si128(t1, a); 3550 6808 return t1; 3551 }6809 } 3552 6810 3553 6811 static __m128i ghash_red(__m128i r0, __m128i r1) 3554 6812 { 3555 6813 __m128i t2, t3; 3556 #ifndef WOLFSSL_AES_GCM_SLOW_CLMUL3557 t2 = _mm_clmulepi64_si128(r0, MOD2_128, 0x10);3558 t3 = _mm_shuffle_epi32(r0, 78);3559 t3 = _mm_xor_si128(t3, t2);3560 t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10);3561 t3 = _mm_shuffle_epi32(t3, 78);3562 t3 = _mm_xor_si128(t3, t2);3563 return _mm_xor_si128(r1, t3);3564 #else3565 6814 __m128i t5, t6, t7; 3566 6815 … … 3583 6832 t7 = _mm_xor_si128(t7, r0); 3584 6833 return _mm_xor_si128(r1, t7); 3585 #endif 3586 } 6834 } 3587 6835 3588 6836 static __m128i gfmul_shifted(__m128i a, __m128i b) … … 3591 6839 gfmul_only(a, b, &t0, &t1); 3592 6840 return ghash_red(t0, t1); 3593 }6841 } 3594 6842 3595 6843 #ifndef AES_GCM_AESNI_NO_UNROLL … … 3609 6857 gfmul_only(a8, b1, &t0, &t1); 3610 6858 return ghash_red(t0, t1); 3611 }6859 } 3612 6860 #endif 3613 6861 3614 /* See Intel® Carry-Less Multiplication Instruction 3615 * and its Usage for Computing the GCM Mode White Paper 3616 * by Shay Gueron, Intel Mobility Group, Israel Development Center; 3617 * and Michael E. 
Kounavis, Intel Labs, Circuits and Systems Research */ 3618 3619 3620 /* Figure 9. AES-GCM – Encrypt With Single Block Ghash at a Time */ 3621 3622 static const __m128i ONE = M128_INIT(0x0, 0x1); 3623 #ifndef AES_GCM_AESNI_NO_UNROLL 3624 static const __m128i TWO = M128_INIT(0x0, 0x2); 3625 static const __m128i THREE = M128_INIT(0x0, 0x3); 3626 static const __m128i FOUR = M128_INIT(0x0, 0x4); 3627 static const __m128i FIVE = M128_INIT(0x0, 0x5); 3628 static const __m128i SIX = M128_INIT(0x0, 0x6); 3629 static const __m128i SEVEN = M128_INIT(0x0, 0x7); 3630 static const __m128i EIGHT = M128_INIT(0x0, 0x8); 3631 #endif 3632 static const __m128i BSWAP_EPI64 = M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f); 3633 static const __m128i BSWAP_MASK = M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607); 3634 3635 static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, 3636 const unsigned char* addt, 3637 const unsigned char* ivec, 3638 unsigned char *tag, unsigned int nbytes, 3639 unsigned int abytes, unsigned int ibytes, 3640 const unsigned char* key, int nr) 6862 6863 static void AES_GCM_encrypt(const unsigned char *in, 6864 unsigned char *out, 6865 const unsigned char* addt, 6866 const unsigned char* ivec, 6867 unsigned char *tag, unsigned int nbytes, 6868 unsigned int abytes, unsigned int ibytes, 6869 unsigned int tbytes, 6870 const unsigned char* key, int nr) 3641 6871 { 3642 6872 int i, j ,k; … … 3654 6884 #endif 3655 6885 3656 if (ibytes == 12) { 3657 Y = _mm_setzero_si128(); 3658 for (j=0; j < 12; j++) 3659 ((unsigned char*)&Y)[j] = ivec[j]; 3660 Y = _mm_insert_epi32(Y, 0x1000000, 3); 3661 /* (Compute E[ZERO, KS] and E[Y0, KS] together */ 3662 tmp1 = _mm_xor_si128(X, KEY[0]); 3663 tmp2 = _mm_xor_si128(Y, KEY[0]); 3664 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 3665 tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); 3666 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 3667 tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); 3668 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 3669 tmp2 = 
_mm_aesenc_si128(tmp2, KEY[3]); 3670 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 3671 tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); 3672 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 3673 tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); 3674 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 3675 tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); 3676 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 3677 tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); 3678 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 3679 tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); 3680 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 3681 tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); 3682 lastKey = KEY[10]; 3683 if (nr > 10) { 3684 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 3685 tmp2 = _mm_aesenc_si128(tmp2, lastKey); 3686 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 3687 tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); 3688 lastKey = KEY[12]; 3689 if (nr > 12) { 3690 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 3691 tmp2 = _mm_aesenc_si128(tmp2, lastKey); 3692 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 3693 tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); 3694 lastKey = KEY[14]; 3695 } 3696 } 3697 H = _mm_aesenclast_si128(tmp1, lastKey); 3698 T = _mm_aesenclast_si128(tmp2, lastKey); 3699 H = _mm_shuffle_epi8(H, BSWAP_MASK); 3700 } 3701 else { 3702 if (ibytes % 16) { 3703 i = ibytes / 16; 3704 for (j=0; j < (int)(ibytes%16); j++) 3705 ((unsigned char*)&last_block)[j] = ivec[i*16+j]; 3706 } 3707 tmp1 = _mm_xor_si128(X, KEY[0]); 3708 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 3709 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 3710 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 3711 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 3712 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 3713 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 3714 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 3715 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 3716 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 3717 lastKey = KEY[10]; 3718 if (nr > 10) { 3719 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 3720 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 3721 lastKey = KEY[12]; 3722 if (nr > 12) { 3723 tmp1 = _mm_aesenc_si128(tmp1, 
lastKey); 3724 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 3725 lastKey = KEY[14]; 3726 } 3727 } 3728 H = _mm_aesenclast_si128(tmp1, lastKey); 3729 H = _mm_shuffle_epi8(H, BSWAP_MASK); 3730 Y = _mm_setzero_si128(); 3731 for (i=0; i < (int)(ibytes/16); i++) { 3732 tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); 3733 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 3734 Y = _mm_xor_si128(Y, tmp1); 3735 Y = gfmul_sw(Y, H); 3736 } 3737 if (ibytes % 16) { 3738 tmp1 = last_block; 3739 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 3740 Y = _mm_xor_si128(Y, tmp1); 3741 Y = gfmul_sw(Y, H); 3742 } 3743 tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); 3744 tmp1 = _mm_insert_epi64(tmp1, 0, 1); 3745 Y = _mm_xor_si128(Y, tmp1); 3746 Y = gfmul_sw(Y, H); 3747 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ 3748 tmp1 = _mm_xor_si128(Y, KEY[0]); 3749 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 3750 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 3751 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 3752 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 3753 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 3754 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 3755 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 3756 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 3757 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 3758 lastKey = KEY[10]; 3759 if (nr > 10) { 3760 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 3761 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 3762 lastKey = KEY[12]; 3763 if (nr > 12) { 3764 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 3765 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 3766 lastKey = KEY[14]; 3767 } 3768 } 3769 T = _mm_aesenclast_si128(tmp1, lastKey); 3770 } 6886 if (ibytes == 12) 6887 aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T); 6888 else 6889 aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T); 3771 6890 3772 6891 for (i=0; i < (int)(abytes/16); i++) { … … 3785 6904 X = gfmul_sw(X, H); 3786 6905 } 3787 3788 6906 tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); 3789 6907 ctr1 = _mm_add_epi32(tmp1, ONE); … … 3826 6944 tmp7 =_mm_xor_si128(tmp7, KEY[0]); 3827 6945 tmp8 
=_mm_xor_si128(tmp8, KEY[0]); 3828 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 3829 tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); 3830 tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); 3831 tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); 3832 tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); 3833 tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); 3834 tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); 3835 tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); 3836 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 3837 tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); 3838 tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); 3839 tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); 3840 tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); 3841 tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); 3842 tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); 3843 tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); 3844 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 3845 tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); 3846 tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); 3847 tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); 3848 tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); 3849 tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); 3850 tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); 3851 tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); 3852 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 3853 tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); 3854 tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); 3855 tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); 3856 tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); 3857 tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); 3858 tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); 3859 tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); 3860 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 3861 tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); 3862 tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); 3863 tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); 3864 tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); 3865 tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); 3866 tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); 3867 tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); 3868 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 3869 tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); 3870 tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); 3871 tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); 3872 tmp5 = 
_mm_aesenc_si128(tmp5, KEY[6]); 3873 tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); 3874 tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); 3875 tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); 3876 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 3877 tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); 3878 tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); 3879 tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); 3880 tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); 3881 tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); 3882 tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); 3883 tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); 3884 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 3885 tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); 3886 tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); 3887 tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); 3888 tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); 3889 tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); 3890 tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); 3891 tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); 3892 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 3893 tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); 3894 tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); 3895 tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); 3896 tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); 3897 tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); 3898 tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); 3899 tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); 6946 AES_ENC_8(1); 6947 AES_ENC_8(2); 6948 AES_ENC_8(3); 6949 AES_ENC_8(4); 6950 AES_ENC_8(5); 6951 AES_ENC_8(6); 6952 AES_ENC_8(7); 6953 AES_ENC_8(8); 6954 AES_ENC_8(9); 3900 6955 lastKey = KEY[10]; 3901 6956 if (nr > 10) { 3902 tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); 3903 tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); 3904 tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); 3905 tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); 3906 tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); 3907 tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); 3908 tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); 3909 tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); 3910 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 3911 tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); 3912 tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); 3913 tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); 3914 tmp5 = 
_mm_aesenc_si128(tmp5, KEY[11]); 3915 tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); 3916 tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); 3917 tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); 6957 AES_ENC_8(10); 6958 AES_ENC_8(11); 3918 6959 lastKey = KEY[12]; 3919 6960 if (nr > 12) { 3920 tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); 3921 tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); 3922 tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); 3923 tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); 3924 tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); 3925 tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); 3926 tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); 3927 tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); 3928 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 3929 tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); 3930 tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); 3931 tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); 3932 tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); 3933 tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); 3934 tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); 3935 tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); 6961 AES_ENC_8(12); 6962 AES_ENC_8(13); 3936 6963 lastKey = KEY[14]; 3937 6964 } 3938 6965 } 3939 tmp1 =_mm_aesenclast_si128(tmp1, lastKey); 3940 tmp2 =_mm_aesenclast_si128(tmp2, lastKey); 3941 tmp3 =_mm_aesenclast_si128(tmp3, lastKey); 3942 tmp4 =_mm_aesenclast_si128(tmp4, lastKey); 3943 tmp5 =_mm_aesenclast_si128(tmp5, lastKey); 3944 tmp6 =_mm_aesenclast_si128(tmp6, lastKey); 3945 tmp7 =_mm_aesenclast_si128(tmp7, lastKey); 3946 tmp8 =_mm_aesenclast_si128(tmp8, lastKey); 3947 tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[0])); 3948 tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[1])); 3949 tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[2])); 3950 tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[3])); 3951 tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[4])); 3952 tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[5])); 3953 tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[6])); 3954 tmp8 = _mm_xor_si128(tmp8, 
_mm_loadu_si128(&((__m128i*)in)[7])); 3955 _mm_storeu_si128(&((__m128i*)out)[0], tmp1); 3956 _mm_storeu_si128(&((__m128i*)out)[1], tmp2); 3957 _mm_storeu_si128(&((__m128i*)out)[2], tmp3); 3958 _mm_storeu_si128(&((__m128i*)out)[3], tmp4); 3959 _mm_storeu_si128(&((__m128i*)out)[4], tmp5); 3960 _mm_storeu_si128(&((__m128i*)out)[5], tmp6); 3961 _mm_storeu_si128(&((__m128i*)out)[6], tmp7); 3962 _mm_storeu_si128(&((__m128i*)out)[7], tmp8); 6966 AES_ENC_LAST_8(); 3963 6967 3964 6968 for (i=1; i < (int)(nbytes/16/8); i++) { … … 4135 7139 } 4136 7140 } 4137 tmp1 =_mm_aesenclast_si128(tmp1, lastKey); 4138 tmp2 =_mm_aesenclast_si128(tmp2, lastKey); 4139 tmp3 =_mm_aesenclast_si128(tmp3, lastKey); 4140 tmp4 =_mm_aesenclast_si128(tmp4, lastKey); 4141 tmp5 =_mm_aesenclast_si128(tmp5, lastKey); 4142 tmp6 =_mm_aesenclast_si128(tmp6, lastKey); 4143 tmp7 =_mm_aesenclast_si128(tmp7, lastKey); 4144 tmp8 =_mm_aesenclast_si128(tmp8, lastKey); 4145 tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); 4146 tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); 4147 tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); 4148 tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); 4149 tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); 4150 tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); 4151 tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); 4152 tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); 4153 _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); 4154 _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); 4155 _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); 4156 _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); 4157 _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); 4158 _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); 4159 _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); 4160 _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); 7141 AES_ENC_LAST_8(); 4161 7142 } 4162 
7143 … … 4204 7185 X = gfmul_shifted(X, H); 4205 7186 } 4206 #else 7187 #else /* AES_GCM_AESNI_NO_UNROLL */ 4207 7188 for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { 4208 7189 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); … … 4233 7214 _mm_storeu_si128(&((__m128i*)out)[k], tmp1); 4234 7215 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 4235 X =_mm_xor_si128(X, tmp1);4236 }7216 X = _mm_xor_si128(X, tmp1); 7217 } 4237 7218 for (; k < (int)(nbytes/16); k++) { 4238 7219 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); … … 4263 7244 tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); 4264 7245 _mm_storeu_si128(&((__m128i*)out)[k], tmp1); 4265 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);7246 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 4266 7247 X =_mm_xor_si128(X, tmp1); 4267 }7248 } 4268 7249 if (k > 0) { 4269 7250 X = gfmul_shifted(X, H); 4270 } 4271 #endif 7251 } 7252 #endif /* AES_GCM_AESNI_NO_UNROLL */ 7253 4272 7254 /* If one partial block remains */ 4273 7255 if (nbytes % 16) { … … 4313 7295 X = _mm_shuffle_epi8(X, BSWAP_MASK); 4314 7296 T = _mm_xor_si128(X, T); 4315 _mm_storeu_si128((__m128i*)tag, T); 4316 } 4317 4318 #ifdef HAVE_INTEL_AVX2 4319 /* Encrypt with key in xmm12. */ 4320 #define VAESENC() \ 4321 "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" \ 4322 "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" \ 4323 "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" \ 4324 "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" \ 4325 "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" \ 4326 "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" \ 4327 "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" \ 4328 "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" 4329 4330 4331 /* Encrypt and GCM mul with the nth round key. 
*/ 4332 #define VAESENC_PCLMUL_N(o1, o2, o3) \ 4333 "vmovaps "#o1"(%[KEY]), %%xmm12\n\t" \ 4334 "vmovdqu "#o2"(%[out]), %%xmm1\n\t" \ 4335 "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" \ 4336 "vmovaps "#o3"(%[HT]), %%xmm0\n\t" \ 4337 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" \ 4338 "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" \ 4339 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" \ 4340 "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" \ 4341 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" \ 4342 "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" \ 4343 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" \ 4344 "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" \ 4345 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" \ 4346 "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" \ 4347 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" \ 4348 "vpslldq $8, %%xmm13, %%xmm14\n\t" \ 4349 "vpsrldq $8, %%xmm13, %%xmm13\n\t" \ 4350 "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" \ 4351 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ 4352 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ 4353 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ 4354 "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" \ 4355 "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" 4356 4357 static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, 4358 const unsigned char* addt, 4359 const unsigned char* ivec, 4360 unsigned char *tag, unsigned int nbytes, 4361 unsigned int abytes, unsigned int ibytes, 4362 const unsigned char* key, int nr) 4363 { 4364 int i, j ,k; 4365 __m128i ctr1; 4366 __m128i H, Y, T; 4367 __m128i X = _mm_setzero_si128(); 4368 __m128i *KEY = (__m128i*)key, lastKey; 4369 __m128i last_block = _mm_setzero_si128(); 4370 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) 4371 __m128i HT[8]; 4372 register __m128i tmp1 asm("xmm4"); 4373 register __m128i tmp2 asm("xmm5"); 4374 register __m128i tmp3 asm("xmm6"); 4375 register __m128i tmp4 asm("xmm7"); 4376 register __m128i tmp5 asm("xmm8"); 4377 register __m128i tmp6 asm("xmm9"); 4378 register __m128i tmp7 asm("xmm10"); 4379 register __m128i tmp8 
asm("xmm11"); 4380 __m128i pctr1[1]; 4381 register __m128i XV asm("xmm2"); 4382 #else 4383 __m128i tmp1, tmp2; 4384 #endif 4385 4386 if (ibytes == 12) { 4387 Y = _mm_setzero_si128(); 4388 for (j=0; j < 12; j++) 4389 ((unsigned char*)&Y)[j] = ivec[j]; 4390 Y = _mm_insert_epi32(Y, 0x1000000, 3); 4391 /* (Compute E[ZERO, KS] and E[Y0, KS] together */ 4392 tmp1 = _mm_xor_si128(X, KEY[0]); 4393 tmp2 = _mm_xor_si128(Y, KEY[0]); 4394 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 4395 tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); 4396 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 4397 tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); 4398 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 4399 tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); 4400 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 4401 tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); 4402 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 4403 tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); 4404 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 4405 tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); 4406 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 4407 tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); 4408 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 4409 tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); 4410 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 4411 tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); 4412 lastKey = KEY[10]; 4413 if (nr > 10) { 4414 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4415 tmp2 = _mm_aesenc_si128(tmp2, lastKey); 4416 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 4417 tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); 4418 lastKey = KEY[12]; 4419 if (nr > 12) { 4420 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4421 tmp2 = _mm_aesenc_si128(tmp2, lastKey); 4422 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 4423 tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); 4424 lastKey = KEY[14]; 4425 } 4426 } 4427 H = _mm_aesenclast_si128(tmp1, lastKey); 4428 T = _mm_aesenclast_si128(tmp2, lastKey); 4429 H = _mm_shuffle_epi8(H, BSWAP_MASK); 4430 } 4431 else { 4432 if (ibytes % 16) { 4433 i = ibytes / 16; 4434 for (j=0; j < (int)(ibytes%16); j++) 4435 ((unsigned char*)&last_block)[j] = ivec[i*16+j]; 
4436 } 4437 tmp1 = _mm_xor_si128(X, KEY[0]); 4438 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 4439 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 4440 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 4441 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 4442 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 4443 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 4444 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 4445 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 4446 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 4447 lastKey = KEY[10]; 4448 if (nr > 10) { 4449 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4450 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 4451 lastKey = KEY[12]; 4452 if (nr > 12) { 4453 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4454 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 4455 lastKey = KEY[14]; 4456 } 4457 } 4458 H = _mm_aesenclast_si128(tmp1, lastKey); 4459 H = _mm_shuffle_epi8(H, BSWAP_MASK); 4460 Y = _mm_setzero_si128(); 4461 for (i=0; i < (int)(ibytes/16); i++) { 4462 tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); 4463 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 4464 Y = _mm_xor_si128(Y, tmp1); 4465 Y = gfmul_sw(Y, H); 4466 } 4467 if (ibytes % 16) { 4468 tmp1 = last_block; 4469 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 4470 Y = _mm_xor_si128(Y, tmp1); 4471 Y = gfmul_sw(Y, H); 4472 } 4473 tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); 4474 tmp1 = _mm_insert_epi64(tmp1, 0, 1); 4475 Y = _mm_xor_si128(Y, tmp1); 4476 Y = gfmul_sw(Y, H); 4477 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ 4478 tmp1 = _mm_xor_si128(Y, KEY[0]); 4479 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 4480 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 4481 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 4482 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 4483 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 4484 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 4485 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 4486 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 4487 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 4488 lastKey = KEY[10]; 4489 if (nr > 10) { 4490 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4491 tmp1 = 
_mm_aesenc_si128(tmp1, KEY[11]); 4492 lastKey = KEY[12]; 4493 if (nr > 12) { 4494 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4495 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 4496 lastKey = KEY[14]; 4497 } 4498 } 4499 T = _mm_aesenclast_si128(tmp1, lastKey); 4500 } 4501 4502 for (i=0; i < (int)(abytes/16); i++) { 4503 tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); 4504 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 4505 X = _mm_xor_si128(X, tmp1); 4506 X = gfmul_sw(X, H); 4507 } 4508 if (abytes%16) { 4509 last_block = _mm_setzero_si128(); 4510 for (j=0; j < (int)(abytes%16); j++) 4511 ((unsigned char*)&last_block)[j] = addt[i*16+j]; 4512 tmp1 = last_block; 4513 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 4514 X = _mm_xor_si128(X, tmp1); 4515 X = gfmul_sw(X, H); 4516 } 4517 4518 tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); 4519 ctr1 = _mm_add_epi32(tmp1, ONE); 4520 H = gfmul_shl1(H); 4521 4522 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) 4523 i = 0; 4524 if (nbytes >= 16*8) { 4525 HT[0] = H; 4526 HT[1] = gfmul_shifted(H, H); 4527 HT[2] = gfmul_shifted(H, HT[1]); 4528 HT[3] = gfmul_shifted(HT[1], HT[1]); 4529 HT[4] = gfmul_shifted(HT[1], HT[2]); 4530 HT[5] = gfmul_shifted(HT[2], HT[2]); 4531 HT[6] = gfmul_shifted(HT[2], HT[3]); 4532 HT[7] = gfmul_shifted(HT[3], HT[3]); 4533 4534 pctr1[0] = ctr1; 4535 __asm__ __volatile__ ( 4536 "vmovaps (%[pctr1]), %%xmm0\n\t" 4537 "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" 4538 "vpshufb %%xmm1, %%xmm0, %[tmp1]\n\t" 4539 "vpaddd %[ONE], %%xmm0, %[tmp2]\n\t" 4540 "vpshufb %%xmm1, %[tmp2], %[tmp2]\n\t" 4541 "vpaddd %[TWO], %%xmm0, %[tmp3]\n\t" 4542 "vpshufb %%xmm1, %[tmp3], %[tmp3]\n\t" 4543 "vpaddd %[THREE], %%xmm0, %[tmp4]\n\t" 4544 "vpshufb %%xmm1, %[tmp4], %[tmp4]\n\t" 4545 "vpaddd %[FOUR], %%xmm0, %[tmp5]\n\t" 4546 "vpshufb %%xmm1, %[tmp5], %[tmp5]\n\t" 4547 "vpaddd %[FIVE], %%xmm0, %[tmp6]\n\t" 4548 "vpshufb %%xmm1, %[tmp6], %[tmp6]\n\t" 4549 "vpaddd %[SIX], %%xmm0, %[tmp7]\n\t" 4550 "vpshufb %%xmm1, %[tmp7], %[tmp7]\n\t" 
4551 "vpaddd %[SEVEN], %%xmm0, %[tmp8]\n\t" 4552 "vpshufb %%xmm1, %[tmp8], %[tmp8]\n\t" 4553 "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" 4554 4555 "vmovaps (%[KEY]), %%xmm1\n\t" 4556 "vmovaps %%xmm0, (%[pctr1])\n\t" 4557 "vpxor %%xmm1, %[tmp1], %[tmp1]\n\t" 4558 "vpxor %%xmm1, %[tmp2], %[tmp2]\n\t" 4559 "vpxor %%xmm1, %[tmp3], %[tmp3]\n\t" 4560 "vpxor %%xmm1, %[tmp4], %[tmp4]\n\t" 4561 "vpxor %%xmm1, %[tmp5], %[tmp5]\n\t" 4562 "vpxor %%xmm1, %[tmp6], %[tmp6]\n\t" 4563 "vpxor %%xmm1, %[tmp7], %[tmp7]\n\t" 4564 "vpxor %%xmm1, %[tmp8], %[tmp8]\n\t" 4565 4566 "vmovaps 16(%[KEY]), %%xmm12\n\t" 4567 VAESENC() 4568 "vmovaps 32(%[KEY]), %%xmm12\n\t" 4569 VAESENC() 4570 "vmovaps 48(%[KEY]), %%xmm12\n\t" 4571 VAESENC() 4572 "vmovaps 64(%[KEY]), %%xmm12\n\t" 4573 VAESENC() 4574 "vmovaps 80(%[KEY]), %%xmm12\n\t" 4575 VAESENC() 4576 "vmovaps 96(%[KEY]), %%xmm12\n\t" 4577 VAESENC() 4578 "vmovaps 112(%[KEY]), %%xmm12\n\t" 4579 VAESENC() 4580 "vmovaps 128(%[KEY]), %%xmm12\n\t" 4581 VAESENC() 4582 "vmovaps 144(%[KEY]), %%xmm12\n\t" 4583 VAESENC() 4584 "cmpl $11, %[nr]\n\t" 4585 "vmovaps 160(%[KEY]), %%xmm12\n\t" 4586 "jl L_enc128_enclast\n\t" 4587 4588 VAESENC() 4589 "vmovaps 176(%[KEY]), %%xmm12\n\t" 4590 VAESENC() 4591 "cmpl $13, %[nr]\n\t" 4592 "vmovaps 192(%[KEY]), %%xmm12\n\t" 4593 "jl L_enc128_enclast\n\t" 4594 4595 VAESENC() 4596 "vmovaps 208(%[KEY]), %%xmm12\n\t" 4597 VAESENC() 4598 "vmovaps 224(%[KEY]), %%xmm12\n\t" 4599 "\n" 4600 "L_enc128_enclast:\n\t" 4601 "vaesenclast %%xmm12, %[tmp1], %[tmp1]\n\t" 4602 "vaesenclast %%xmm12, %[tmp2], %[tmp2]\n\t" 4603 "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" 4604 "vpxor 16(%[in]), %[tmp2], %[tmp2]\n\t" 4605 "vmovdqu %[tmp1], (%[out])\n\t" 4606 "vmovdqu %[tmp2], 16(%[out])\n\t" 4607 "vaesenclast %%xmm12, %[tmp3], %[tmp3]\n\t" 4608 "vaesenclast %%xmm12, %[tmp4], %[tmp4]\n\t" 4609 "vpxor 32(%[in]), %[tmp3], %[tmp3]\n\t" 4610 "vpxor 48(%[in]), %[tmp4], %[tmp4]\n\t" 4611 "vmovdqu %[tmp3], 32(%[out])\n\t" 4612 "vmovdqu %[tmp4], 48(%[out])\n\t" 4613 
"vaesenclast %%xmm12, %[tmp5], %[tmp5]\n\t" 4614 "vaesenclast %%xmm12, %[tmp6], %[tmp6]\n\t" 4615 "vpxor 64(%[in]), %[tmp5], %[tmp5]\n\t" 4616 "vpxor 80(%[in]), %[tmp6], %[tmp6]\n\t" 4617 "vmovdqu %[tmp5], 64(%[out])\n\t" 4618 "vmovdqu %[tmp6], 80(%[out])\n\t" 4619 "vaesenclast %%xmm12, %[tmp7], %[tmp7]\n\t" 4620 "vaesenclast %%xmm12, %[tmp8], %[tmp8]\n\t" 4621 "vpxor 96(%[in]), %[tmp7], %[tmp7]\n\t" 4622 "vpxor 112(%[in]), %[tmp8], %[tmp8]\n\t" 4623 "vmovdqu %[tmp7], 96(%[out])\n\t" 4624 "vmovdqu %[tmp8], 112(%[out])\n\t" 4625 4626 : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), 4627 [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), 4628 [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8) 4629 : [KEY] "r" (KEY), [pctr1] "r" (pctr1), 4630 [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), 4631 [BSWAP_EPI64] "m" (BSWAP_EPI64), 4632 [ONE] "m" (ONE), [TWO] "m" (TWO), 4633 [THREE] "m" (THREE), [FOUR] "m" (FOUR), 4634 [FIVE] "m" (FIVE), [SIX] "m" (SIX), 4635 [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT) 4636 : "xmm15", "xmm14", "xmm13", "xmm12", 4637 "xmm0", "xmm1", "xmm3", "memory" 4638 ); 4639 4640 XV = X; 4641 for (i=1; i < (int)(nbytes/16/8); i++) { 4642 __asm__ __volatile__ ( 4643 "vmovaps (%[pctr1]), %%xmm0\n\t" 4644 "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" 4645 "vpshufb %%xmm1, %%xmm0, %[tmp1]\n\t" 4646 "vpaddd %[ONE], %%xmm0, %[tmp2]\n\t" 4647 "vpshufb %%xmm1, %[tmp2], %[tmp2]\n\t" 4648 "vpaddd %[TWO], %%xmm0, %[tmp3]\n\t" 4649 "vpshufb %%xmm1, %[tmp3], %[tmp3]\n\t" 4650 "vpaddd %[THREE], %%xmm0, %[tmp4]\n\t" 4651 "vpshufb %%xmm1, %[tmp4], %[tmp4]\n\t" 4652 "vpaddd %[FOUR], %%xmm0, %[tmp5]\n\t" 4653 "vpshufb %%xmm1, %[tmp5], %[tmp5]\n\t" 4654 "vpaddd %[FIVE], %%xmm0, %[tmp6]\n\t" 4655 "vpshufb %%xmm1, %[tmp6], %[tmp6]\n\t" 4656 "vpaddd %[SIX], %%xmm0, %[tmp7]\n\t" 4657 "vpshufb %%xmm1, %[tmp7], %[tmp7]\n\t" 4658 "vpaddd %[SEVEN], %%xmm0, %[tmp8]\n\t" 4659 "vpshufb %%xmm1, %[tmp8], %[tmp8]\n\t" 4660 "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" 
4661 4662 "vmovaps (%[KEY]), %%xmm1\n\t" 4663 "vmovaps %%xmm0, (%[pctr1])\n\t" 4664 "vpxor %%xmm1, %[tmp1], %[tmp1]\n\t" 4665 "vpxor %%xmm1, %[tmp2], %[tmp2]\n\t" 4666 "vpxor %%xmm1, %[tmp3], %[tmp3]\n\t" 4667 "vpxor %%xmm1, %[tmp4], %[tmp4]\n\t" 4668 "vpxor %%xmm1, %[tmp5], %[tmp5]\n\t" 4669 "vpxor %%xmm1, %[tmp6], %[tmp6]\n\t" 4670 "vpxor %%xmm1, %[tmp7], %[tmp7]\n\t" 4671 "vpxor %%xmm1, %[tmp8], %[tmp8]\n\t" 4672 4673 "vmovaps 16(%[KEY]), %%xmm12\n\t" 4674 "vmovdqu -128(%[out]), %%xmm1\n\t" 4675 "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" 4676 "vmovaps 112(%[HT]), %%xmm0\n\t" 4677 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" 4678 "vpxor %[XV], %%xmm1, %%xmm1\n\t" 4679 "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" 4680 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" 4681 "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" 4682 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" 4683 "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" 4684 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" 4685 "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" 4686 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" 4687 "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" 4688 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4689 "vpslldq $8, %%xmm13, %%xmm2\n\t" 4690 "vpsrldq $8, %%xmm13, %%xmm13\n\t" 4691 "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" 4692 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" 4693 "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" 4694 "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" 4695 4696 VAESENC_PCLMUL_N( 32, -112, 96) 4697 VAESENC_PCLMUL_N( 48, -96, 80) 4698 VAESENC_PCLMUL_N( 64, -80, 64) 4699 VAESENC_PCLMUL_N( 80, -64, 48) 4700 VAESENC_PCLMUL_N( 96, -48, 32) 4701 VAESENC_PCLMUL_N(112, -32, 16) 4702 VAESENC_PCLMUL_N(128, -16, 0) 4703 4704 "vmovaps 144(%[KEY]), %%xmm12\n\t" 4705 "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" 4706 "vmovdqa %[MOD2_128], %%xmm0\n\t" 4707 "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" 4708 "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" 4709 "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" 4710 "vpshufd $78, %%xmm2, %%xmm13\n\t" 4711 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 
4712 "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" 4713 "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" 4714 "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" 4715 "vpshufd $78, %%xmm13, %%xmm13\n\t" 4716 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4717 "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" 4718 "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" 4719 "vmovdqa %%xmm13, %%xmm2\n\t" 4720 "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" 4721 "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" 4722 "cmpl $11, %[nr]\n\t" 4723 "vmovaps 160(%[KEY]), %%xmm12\n\t" 4724 "jl %=f\n\t" 4725 4726 VAESENC() 4727 "vmovaps 176(%[KEY]), %%xmm12\n\t" 4728 VAESENC() 4729 "cmpl $13, %[nr]\n\t" 4730 "vmovaps 192(%[KEY]), %%xmm12\n\t" 4731 "jl %=f\n\t" 4732 4733 VAESENC() 4734 "vmovaps 208(%[KEY]), %%xmm12\n\t" 4735 VAESENC() 4736 "vmovaps 224(%[KEY]), %%xmm12\n\t" 4737 4738 "%=:\n\t" 4739 "vaesenclast %%xmm12, %[tmp1], %[tmp1]\n\t" 4740 "vaesenclast %%xmm12, %[tmp2], %[tmp2]\n\t" 4741 "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" 4742 "vpxor 16(%[in]), %[tmp2], %[tmp2]\n\t" 4743 "vmovdqu %[tmp1], (%[out])\n\t" 4744 "vmovdqu %[tmp2], 16(%[out])\n\t" 4745 "vaesenclast %%xmm12, %[tmp3], %[tmp3]\n\t" 4746 "vaesenclast %%xmm12, %[tmp4], %[tmp4]\n\t" 4747 "vpxor 32(%[in]), %[tmp3], %[tmp3]\n\t" 4748 "vpxor 48(%[in]), %[tmp4], %[tmp4]\n\t" 4749 "vmovdqu %[tmp3], 32(%[out])\n\t" 4750 "vmovdqu %[tmp4], 48(%[out])\n\t" 4751 "vaesenclast %%xmm12, %[tmp5], %[tmp5]\n\t" 4752 "vaesenclast %%xmm12, %[tmp6], %[tmp6]\n\t" 4753 "vpxor 64(%[in]), %[tmp5], %[tmp5]\n\t" 4754 "vpxor 80(%[in]), %[tmp6], %[tmp6]\n\t" 4755 "vmovdqu %[tmp5], 64(%[out])\n\t" 4756 "vmovdqu %[tmp6], 80(%[out])\n\t" 4757 "vaesenclast %%xmm12, %[tmp7], %[tmp7]\n\t" 4758 "vaesenclast %%xmm12, %[tmp8], %[tmp8]\n\t" 4759 "vpxor 96(%[in]), %[tmp7], %[tmp7]\n\t" 4760 "vpxor 112(%[in]), %[tmp8], %[tmp8]\n\t" 4761 "vmovdqu %[tmp7], 96(%[out])\n\t" 4762 "vmovdqu %[tmp8], 112(%[out])\n\t" 4763 4764 : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), 4765 [tmp4] "=xr" (tmp4), [tmp5] "=xr" 
(tmp5), [tmp6] "=xr" (tmp6), 4766 [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8), 4767 [XV] "+xr" (XV) 4768 : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1), 4769 [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), 4770 [BSWAP_MASK] "m" (BSWAP_MASK), 4771 [BSWAP_EPI64] "m" (BSWAP_EPI64), 4772 [ONE] "m" (ONE), [TWO] "m" (TWO), 4773 [THREE] "m" (THREE), [FOUR] "m" (FOUR), 4774 [FIVE] "m" (FIVE), [SIX] "m" (SIX), 4775 [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT), 4776 [MOD2_128] "m" (MOD2_128) 4777 : "xmm15", "xmm14", "xmm13", "xmm12", 4778 "xmm0", "xmm1", "xmm3", "memory" 4779 ); 4780 } 4781 X = XV; 4782 ctr1 = pctr1[0]; 4783 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 4784 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 4785 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 4786 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); 4787 tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); 4788 tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); 4789 tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); 4790 tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); 4791 tmp1 = _mm_xor_si128(X, tmp1); 4792 X = gfmul8(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, 4793 HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); 4794 } 4795 for (k = i*8; k < (int)(nbytes/16); k++) { 4796 __asm__ __volatile__ ( 4797 "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" 4798 "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" 4799 "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" 4800 "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" 4801 "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" 4802 "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" 4803 "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" 4804 "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" 4805 "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" 4806 "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" 4807 "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" 4808 "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" 4809 "cmpl $11, %[nr]\n\t" 4810 "vmovaps 160(%[KEY]), %[tmp2]\n\t" 4811 "jl %=f\n\t" 4812 "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" 4813 "vaesenc 176(%[KEY]), 
%[tmp1], %[tmp1]\n\t" 4814 "cmpl $13, %[nr]\n\t" 4815 "vmovaps 192(%[KEY]), %[tmp2]\n\t" 4816 "jl %=f\n\t" 4817 "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" 4818 "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" 4819 "vmovaps 224(%[KEY]), %[tmp2]\n\t" 4820 "%=:\n\t" 4821 "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" 4822 "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" 4823 "vmovdqu %[tmp1], (%[out])\n\t" 4824 "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" 4825 4826 "vpxor %[tmp1], %[X], %[X]\n\t" 4827 "# Carryless Multiply X by H (128 x 128)\n\t" 4828 "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" 4829 "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" 4830 "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" 4831 "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" 4832 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4833 "vpslldq $8, %%xmm13, %%xmm2\n\t" 4834 "vpsrldq $8, %%xmm13, %%xmm13\n\t" 4835 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" 4836 "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" 4837 "# Reduce\n\t" 4838 "vmovdqa %[MOD2_128], %%xmm0\n\t" 4839 "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" 4840 "vpshufd $78, %%xmm2, %%xmm13\n\t" 4841 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4842 "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" 4843 "vpshufd $78, %%xmm13, %%xmm13\n\t" 4844 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4845 "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" 4846 "vmovdqa %%xmm13, %[X]\n\t" 4847 "# End Reduce\n\t" 4848 4849 : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), 4850 [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) 4851 : [KEY] "r" (KEY), 4852 [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), 4853 [BSWAP_MASK] "m" (BSWAP_MASK), 4854 [BSWAP_EPI64] "m" (BSWAP_EPI64), 4855 [ONE] "m" (ONE), 4856 [MOD2_128] "m" (MOD2_128) 4857 : "xmm15", "xmm14", "xmm13", 4858 "xmm0", "xmm1", "xmm2", "xmm3", "memory" 4859 ); 4860 } 4861 #else 4862 for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { 4863 __asm__ __volatile__ ( 4864 "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" 4865 "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" 4866 "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" 
4867 "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" 4868 "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" 4869 "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" 4870 "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" 4871 "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" 4872 "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" 4873 "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" 4874 "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" 4875 "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" 4876 "cmpl $11, %[nr]\n\t" 4877 "vmovaps 160(%[KEY]), %[tmp2]\n\t" 4878 "jl %=f\n\t" 4879 "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" 4880 "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" 4881 "cmpl $13, %[nr]\n\t" 4882 "vmovaps 192(%[KEY]), %[tmp2]\n\t" 4883 "jl %=f\n\t" 4884 "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" 4885 "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" 4886 "vmovaps 224(%[KEY]), %[tmp2]\n\t" 4887 "%=:\n\t" 4888 "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" 4889 "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" 4890 "vmovdqu %[tmp1], (%[out])\n\t" 4891 "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" 4892 "vpxor %[tmp1], %[X], %[X]\n\t" 4893 4894 : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), 4895 [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) 4896 : [KEY] "r" (KEY), 4897 [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), 4898 [BSWAP_MASK] "m" (BSWAP_MASK), 4899 [BSWAP_EPI64] "m" (BSWAP_EPI64), 4900 [ONE] "m" (ONE), 4901 [MOD2_128] "m" (MOD2_128) 4902 : "memory" 4903 ); 4904 } 4905 for (; k < (int)(nbytes/16); k++) { 4906 __asm__ __volatile__ ( 4907 "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" 4908 "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" 4909 "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" 4910 "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" 4911 "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" 4912 "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" 4913 "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" 4914 "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" 4915 "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" 4916 "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" 4917 "vpclmulqdq $17, %[H], %[X], 
%%xmm1\n\t" 4918 "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" 4919 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4920 "vpslldq $8, %%xmm13, %%xmm2\n\t" 4921 "vpsrldq $8, %%xmm13, %%xmm13\n\t" 4922 "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" 4923 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" 4924 "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" 4925 "vmovdqa %[MOD2_128], %%xmm0\n\t" 4926 "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" 4927 "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" 4928 "vpshufd $78, %%xmm2, %%xmm13\n\t" 4929 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4930 "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" 4931 "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" 4932 "vpshufd $78, %%xmm13, %%xmm13\n\t" 4933 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" 4934 "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" 4935 "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" 4936 "vmovdqa %%xmm13, %[X]\n\t" 4937 "cmpl $11, %[nr]\n\t" 4938 "vmovaps 160(%[KEY]), %[tmp2]\n\t" 4939 "jl %=f\n\t" 4940 "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" 4941 "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" 4942 "cmpl $13, %[nr]\n\t" 4943 "vmovaps 192(%[KEY]), %[tmp2]\n\t" 4944 "jl %=f\n\t" 4945 "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" 4946 "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" 4947 "vmovaps 224(%[KEY]), %[tmp2]\n\t" 4948 "%=:\n\t" 4949 "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" 4950 "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" 4951 "vmovdqu %[tmp1], (%[out])\n\t" 4952 "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" 4953 "vpxor %[tmp1], %[X], %[X]\n\t" 4954 4955 : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), 4956 [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) 4957 : [KEY] "r" (KEY), 4958 [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), 4959 [BSWAP_MASK] "m" (BSWAP_MASK), 4960 [BSWAP_EPI64] "m" (BSWAP_EPI64), 4961 [ONE] "m" (ONE), 4962 [MOD2_128] "m" (MOD2_128) 4963 : "xmm15", "xmm14", "xmm13", 4964 "xmm0", "xmm1", "xmm2", "xmm3", "memory" 4965 ); 4966 } 4967 if (k > 0) { 4968 X = gfmul_shifted(X, H); 4969 } 4970 #endif 4971 /* If one partial block remains 
*/ 4972 if (nbytes % 16) { 4973 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 4974 tmp1 = _mm_xor_si128(tmp1, KEY[0]); 4975 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 4976 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 4977 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 4978 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 4979 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 4980 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 4981 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 4982 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 4983 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 4984 lastKey = KEY[10]; 4985 if (nr > 10) { 4986 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4987 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 4988 lastKey = KEY[12]; 4989 if (nr > 12) { 4990 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 4991 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 4992 lastKey = KEY[14]; 4993 } 4994 } 4995 tmp1 = _mm_aesenclast_si128(tmp1, lastKey); 4996 last_block = tmp1; 4997 for (j=0; j < (int)(nbytes%16); j++) 4998 ((unsigned char*)&last_block)[j] = in[k*16+j]; 4999 tmp1 = _mm_xor_si128(tmp1, last_block); 5000 last_block = tmp1; 5001 for (j=0; j < (int)(nbytes%16); j++) 5002 out[k*16+j] = ((unsigned char*)&last_block)[j]; 5003 tmp1 = last_block; 5004 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 5005 X =_mm_xor_si128(X, tmp1); 5006 X = gfmul_shifted(X, H); 5007 } 5008 tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); 5009 tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); 5010 X = _mm_xor_si128(X, tmp1); 5011 X = gfmul_shifted(X, H); 5012 X = _mm_shuffle_epi8(X, BSWAP_MASK); 5013 T = _mm_xor_si128(X, T); 5014 _mm_storeu_si128((__m128i*)tag, T); 5015 } 5016 #endif /* HAVE_INTEL_AVX2 */ 7297 /*_mm_storeu_si128((__m128i*)tag, T);*/ 7298 XMEMCPY(tag, &T, tbytes); 7299 } 5017 7300 5018 7301 #ifdef HAVE_AES_DECRYPT 5019 /* Figure 10. 
AES-GCM – Decrypt With Single Block Ghash at a Time */ 5020 5021 static int AES_GCM_decrypt(const unsigned char *in, unsigned char *out, 5022 const unsigned char* addt, const unsigned char* ivec, 7302 7303 static void AES_GCM_decrypt(const unsigned char *in, 7304 unsigned char *out, 7305 const unsigned char* addt, 7306 const unsigned char* ivec, 5023 7307 const unsigned char *tag, int nbytes, int abytes, 5024 int ibytes, const unsigned char* key, int nr) 7308 int ibytes, word32 tbytes, const unsigned char* key, 7309 int nr, int* res) 5025 7310 { 5026 7311 int i, j ,k; … … 5035 7320 __m128i r0, r1; 5036 7321 __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; 5037 #endif 5038 5039 if (ibytes == 12) { 5040 Y = _mm_setzero_si128(); 5041 for (j=0; j < 12; j++) 5042 ((unsigned char*)&Y)[j] = ivec[j]; 5043 Y = _mm_insert_epi32(Y, 0x1000000, 3); 5044 /* (Compute E[ZERO, KS] and E[Y0, KS] together */ 5045 tmp1 = _mm_xor_si128(X, KEY[0]); 5046 tmp2 = _mm_xor_si128(Y, KEY[0]); 5047 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 5048 tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); 5049 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 5050 tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); 5051 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 5052 tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); 5053 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 5054 tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); 5055 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 5056 tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); 5057 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 5058 tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); 5059 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 5060 tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); 5061 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 5062 tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); 5063 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 5064 tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); 5065 lastKey = KEY[10]; 5066 if (nr > 10) { 5067 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 5068 tmp2 = _mm_aesenc_si128(tmp2, lastKey); 5069 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 5070 tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); 5071 
lastKey = KEY[12]; 5072 if (nr > 12) { 5073 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 5074 tmp2 = _mm_aesenc_si128(tmp2, lastKey); 5075 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 5076 tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); 5077 lastKey = KEY[14]; 5078 } 5079 } 5080 H = _mm_aesenclast_si128(tmp1, lastKey); 5081 T = _mm_aesenclast_si128(tmp2, lastKey); 5082 H = _mm_shuffle_epi8(H, BSWAP_MASK); 5083 } 5084 else { 5085 if (ibytes % 16) { 5086 i = ibytes / 16; 5087 for (j=0; j < ibytes%16; j++) 5088 ((unsigned char*)&last_block)[j] = ivec[i*16+j]; 5089 } 5090 tmp1 = _mm_xor_si128(X, KEY[0]); 5091 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 5092 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 5093 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 5094 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 5095 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 5096 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 5097 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 5098 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 5099 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 5100 lastKey = KEY[10]; 5101 if (nr > 10) { 5102 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 5103 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 5104 lastKey = KEY[12]; 5105 if (nr > 12) { 5106 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 5107 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 5108 lastKey = KEY[14]; 5109 } 5110 } 5111 H = _mm_aesenclast_si128(tmp1, lastKey); 5112 H = _mm_shuffle_epi8(H, BSWAP_MASK); 5113 5114 Y = _mm_setzero_si128(); 5115 for (i=0; i < ibytes/16; i++) { 5116 tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); 5117 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 5118 Y = _mm_xor_si128(Y, tmp1); 5119 Y = gfmul_sw(Y, H); 5120 } 5121 if (ibytes % 16) { 5122 tmp1 = last_block; 5123 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 5124 Y = _mm_xor_si128(Y, tmp1); 5125 Y = gfmul_sw(Y, H); 5126 } 5127 tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); 5128 tmp1 = _mm_insert_epi64(tmp1, 0, 1); 5129 Y = _mm_xor_si128(Y, tmp1); 5130 Y = gfmul_sw(Y, H); 5131 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ 5132 
tmp1 = _mm_xor_si128(Y, KEY[0]); 5133 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 5134 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); 5135 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); 5136 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); 5137 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); 5138 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); 5139 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); 5140 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); 5141 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); 5142 lastKey = KEY[10]; 5143 if (nr > 10) { 5144 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 5145 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); 5146 lastKey = KEY[12]; 5147 if (nr > 12) { 5148 tmp1 = _mm_aesenc_si128(tmp1, lastKey); 5149 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); 5150 lastKey = KEY[14]; 5151 } 5152 } 5153 T = _mm_aesenclast_si128(tmp1, lastKey); 5154 } 7322 #endif /* AES_GCM_AESNI_NO_UNROLL */ 7323 7324 if (ibytes == 12) 7325 aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T); 7326 else 7327 aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T); 5155 7328 5156 7329 for (i=0; i<abytes/16; i++) { … … 5359 7532 tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); 5360 7533 lastKey = KEY[14]; 5361 } 5362 } 5363 tmp1 =_mm_aesenclast_si128(tmp1, lastKey); 5364 tmp2 =_mm_aesenclast_si128(tmp2, lastKey); 5365 tmp3 =_mm_aesenclast_si128(tmp3, lastKey); 5366 tmp4 =_mm_aesenclast_si128(tmp4, lastKey); 5367 tmp5 =_mm_aesenclast_si128(tmp5, lastKey); 5368 tmp6 =_mm_aesenclast_si128(tmp6, lastKey); 5369 tmp7 =_mm_aesenclast_si128(tmp7, lastKey); 5370 tmp8 =_mm_aesenclast_si128(tmp8, lastKey); 5371 tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); 5372 tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); 5373 tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); 5374 tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); 5375 tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); 5376 tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); 5377 tmp7 = _mm_xor_si128(tmp7, 
_mm_loadu_si128(&((__m128i*)in)[i*8+6])); 5378 tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); 5379 _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); 5380 _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); 5381 _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); 5382 _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); 5383 _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); 5384 _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); 5385 _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); 5386 _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); 5387 } 5388 } 5389 #endif 7534 } 7535 } 7536 AES_ENC_LAST_8(); 7537 } 7538 } 7539 7540 #endif /* AES_GCM_AESNI_NO_UNROLL */ 7541 5390 7542 for (k = i*8; k < nbytes/16; k++) { 5391 7543 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); … … 5469 7621 T = _mm_xor_si128(X, T); 5470 7622 5471 if (0xffff != 5472 _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) 5473 return 0; /* in case the authentication failed */ 5474 5475 return 1; /* when successful returns 1 */ 7623 /* if (0xffff != 7624 _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) */ 7625 if (XMEMCMP(tag, &T, tbytes) != 0) 7626 *res = 0; /* in case the authentication failed */ 7627 else 7628 *res = 1; /* when successful returns 1 */ 5476 7629 } 5477 7630 5478 #ifdef HAVE_INTEL_AVX25479 static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out,5480 const unsigned char* addt,5481 const unsigned char* ivec,5482 const unsigned char *tag, int nbytes,5483 int abytes, int ibytes,5484 const unsigned char* key, int nr)5485 {5486 int i, j ,k;5487 __m128i H, Y, T;5488 __m128i *KEY = (__m128i*)key, lastKey;5489 __m128i ctr1;5490 __m128i last_block = _mm_setzero_si128();5491 __m128i X = _mm_setzero_si128();5492 __m128i tmp1, tmp2;5493 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL)5494 __m128i HT[8];5495 __m128i pctr1[1];5496 register __m128i XV asm("xmm2");5497 #else5498 __m128i XV;5499 #endif5500 5501 if 
(ibytes == 12) {5502 Y = _mm_setzero_si128();5503 for (j=0; j < 12; j++)5504 ((unsigned char*)&Y)[j] = ivec[j];5505 Y = _mm_insert_epi32(Y, 0x1000000, 3);5506 /* (Compute E[ZERO, KS] and E[Y0, KS] together */5507 tmp1 = _mm_xor_si128(X, KEY[0]);5508 tmp2 = _mm_xor_si128(Y, KEY[0]);5509 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]);5510 tmp2 = _mm_aesenc_si128(tmp2, KEY[1]);5511 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]);5512 tmp2 = _mm_aesenc_si128(tmp2, KEY[2]);5513 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]);5514 tmp2 = _mm_aesenc_si128(tmp2, KEY[3]);5515 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]);5516 tmp2 = _mm_aesenc_si128(tmp2, KEY[4]);5517 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]);5518 tmp2 = _mm_aesenc_si128(tmp2, KEY[5]);5519 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]);5520 tmp2 = _mm_aesenc_si128(tmp2, KEY[6]);5521 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]);5522 tmp2 = _mm_aesenc_si128(tmp2, KEY[7]);5523 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]);5524 tmp2 = _mm_aesenc_si128(tmp2, KEY[8]);5525 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]);5526 tmp2 = _mm_aesenc_si128(tmp2, KEY[9]);5527 lastKey = KEY[10];5528 if (nr > 10) {5529 tmp1 = _mm_aesenc_si128(tmp1, lastKey);5530 tmp2 = _mm_aesenc_si128(tmp2, lastKey);5531 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]);5532 tmp2 = _mm_aesenc_si128(tmp2, KEY[11]);5533 lastKey = KEY[12];5534 if (nr > 12) {5535 tmp1 = _mm_aesenc_si128(tmp1, lastKey);5536 tmp2 = _mm_aesenc_si128(tmp2, lastKey);5537 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]);5538 tmp2 = _mm_aesenc_si128(tmp2, KEY[13]);5539 lastKey = KEY[14];5540 }5541 }5542 H = _mm_aesenclast_si128(tmp1, lastKey);5543 T = _mm_aesenclast_si128(tmp2, lastKey);5544 H = _mm_shuffle_epi8(H, BSWAP_MASK);5545 }5546 else {5547 if (ibytes % 16) {5548 i = ibytes / 16;5549 for (j=0; j < ibytes%16; j++)5550 ((unsigned char*)&last_block)[j] = ivec[i*16+j];5551 }5552 tmp1 = _mm_xor_si128(X, KEY[0]);5553 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]);5554 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]);5555 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]);5556 tmp1 = 
_mm_aesenc_si128(tmp1, KEY[4]);5557 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]);5558 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]);5559 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]);5560 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]);5561 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]);5562 lastKey = KEY[10];5563 if (nr > 10) {5564 tmp1 = _mm_aesenc_si128(tmp1, lastKey);5565 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]);5566 lastKey = KEY[12];5567 if (nr > 12) {5568 tmp1 = _mm_aesenc_si128(tmp1, lastKey);5569 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]);5570 lastKey = KEY[14];5571 }5572 }5573 H = _mm_aesenclast_si128(tmp1, lastKey);5574 H = _mm_shuffle_epi8(H, BSWAP_MASK);5575 5576 Y = _mm_setzero_si128();5577 for (i=0; i < ibytes/16; i++) {5578 tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);5579 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);5580 Y = _mm_xor_si128(Y, tmp1);5581 Y = gfmul_sw(Y, H);5582 }5583 if (ibytes % 16) {5584 tmp1 = last_block;5585 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);5586 Y = _mm_xor_si128(Y, tmp1);5587 Y = gfmul_sw(Y, H);5588 }5589 tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0);5590 tmp1 = _mm_insert_epi64(tmp1, 0, 1);5591 Y = _mm_xor_si128(Y, tmp1);5592 Y = gfmul_sw(Y, H);5593 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */5594 tmp1 = _mm_xor_si128(Y, KEY[0]);5595 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]);5596 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]);5597 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]);5598 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]);5599 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]);5600 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]);5601 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]);5602 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]);5603 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]);5604 lastKey = KEY[10];5605 if (nr > 10) {5606 tmp1 = _mm_aesenc_si128(tmp1, lastKey);5607 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]);5608 lastKey = KEY[12];5609 if (nr > 12) {5610 tmp1 = _mm_aesenc_si128(tmp1, lastKey);5611 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]);5612 lastKey = KEY[14];5613 }5614 }5615 T = _mm_aesenclast_si128(tmp1, lastKey);5616 }5617 5618 
for (i=0; i<abytes/16; i++) {5619 tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);5620 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);5621 X = _mm_xor_si128(X, tmp1);5622 X = gfmul_sw(X, H);5623 }5624 if (abytes%16) {5625 last_block = _mm_setzero_si128();5626 for (j=0; j<abytes%16; j++)5627 ((unsigned char*)&last_block)[j] = addt[i*16+j];5628 tmp1 = last_block;5629 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);5630 X = _mm_xor_si128(X, tmp1);5631 X = gfmul_sw(X, H);5632 }5633 5634 tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);5635 ctr1 = _mm_add_epi32(tmp1, ONE);5636 H = gfmul_shl1(H);5637 i = 0;5638 5639 #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL)5640 5641 if (0 < nbytes/16/8) {5642 HT[0] = H;5643 HT[1] = gfmul_shifted(H, H);5644 HT[2] = gfmul_shifted(H, HT[1]);5645 HT[3] = gfmul_shifted(HT[1], HT[1]);5646 HT[4] = gfmul_shifted(HT[1], HT[2]);5647 HT[5] = gfmul_shifted(HT[2], HT[2]);5648 HT[6] = gfmul_shifted(HT[2], HT[3]);5649 HT[7] = gfmul_shifted(HT[3], HT[3]);5650 5651 pctr1[0] = ctr1;5652 XV = X;5653 for (; i < nbytes/16/8; i++) {5654 __asm__ __volatile__ (5655 "vmovaps (%[pctr1]), %%xmm0\n\t"5656 "vmovaps %[BSWAP_EPI64], %%xmm1\n\t"5657 "vpshufb %%xmm1, %%xmm0, %%xmm4\n\t"5658 "vpaddd %[ONE], %%xmm0, %%xmm5\n\t"5659 "vpshufb %%xmm1, %%xmm5, %%xmm5\n\t"5660 "vpaddd %[TWO], %%xmm0, %%xmm6\n\t"5661 "vpshufb %%xmm1, %%xmm6, %%xmm6\n\t"5662 "vpaddd %[THREE], %%xmm0, %%xmm7\n\t"5663 "vpshufb %%xmm1, %%xmm7, %%xmm7\n\t"5664 "vpaddd %[FOUR], %%xmm0, %%xmm8\n\t"5665 "vpshufb %%xmm1, %%xmm8, %%xmm8\n\t"5666 "vpaddd %[FIVE], %%xmm0, %%xmm9\n\t"5667 "vpshufb %%xmm1, %%xmm9, %%xmm9\n\t"5668 "vpaddd %[SIX], %%xmm0, %%xmm10\n\t"5669 "vpshufb %%xmm1, %%xmm10, %%xmm10\n\t"5670 "vpaddd %[SEVEN], %%xmm0, %%xmm11\n\t"5671 "vpshufb %%xmm1, %%xmm11, %%xmm11\n\t"5672 "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t"5673 5674 "vmovaps (%[KEY]), %%xmm1\n\t"5675 "vmovaps %%xmm0, (%[pctr1])\n\t"5676 "vpxor %%xmm1, %%xmm4, %%xmm4\n\t"5677 "vpxor %%xmm1, %%xmm5, %%xmm5\n\t"5678 "vpxor 
%%xmm1, %%xmm6, %%xmm6\n\t"5679 "vpxor %%xmm1, %%xmm7, %%xmm7\n\t"5680 "vpxor %%xmm1, %%xmm8, %%xmm8\n\t"5681 "vpxor %%xmm1, %%xmm9, %%xmm9\n\t"5682 "vpxor %%xmm1, %%xmm10, %%xmm10\n\t"5683 "vpxor %%xmm1, %%xmm11, %%xmm11\n\t"5684 5685 "vmovaps 16(%[KEY]), %%xmm12\n\t"5686 "vmovdqu (%[in]), %%xmm1\n\t"5687 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5688 "vmovaps 112(%[HT]), %%xmm0\n\t"5689 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5690 "vpxor %[XV], %%xmm1, %%xmm1\n\t"5691 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5692 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5693 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5694 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5695 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5696 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5697 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5698 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5699 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5700 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5701 "vpslldq $8, %%xmm13, %%xmm2\n\t"5702 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5703 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5704 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5705 "vpxor %%xmm13, %%xmm1, %%xmm3\n\t"5706 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5707 5708 "vmovaps 32(%[KEY]), %%xmm12\n\t"5709 "vmovdqu 16(%[in]), %%xmm1\n\t"5710 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5711 "vmovaps 96(%[HT]), %%xmm0\n\t"5712 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5713 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5714 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5715 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5716 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5717 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5718 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5719 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5720 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5721 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5722 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5723 "vpslldq $8, %%xmm13, %%xmm14\n\t"5724 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5725 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5726 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5727 "vpxor %%xmm1, 
%%xmm3, %%xmm3\n\t"5728 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t"5729 "vpxor %%xmm13, %%xmm3, %%xmm3\n\t"5730 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5731 5732 "vmovaps 48(%[KEY]), %%xmm12\n\t"5733 "vmovdqu 32(%[in]), %%xmm1\n\t"5734 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5735 "vmovaps 80(%[HT]), %%xmm0\n\t"5736 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5737 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5738 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5739 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5740 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5741 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5742 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5743 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5744 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5745 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5746 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5747 "vpslldq $8, %%xmm13, %%xmm14\n\t"5748 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5749 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5750 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5751 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t"5752 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t"5753 "vpxor %%xmm13, %%xmm3, %%xmm3\n\t"5754 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5755 5756 "vmovaps 64(%[KEY]), %%xmm12\n\t"5757 "vmovdqu 48(%[in]), %%xmm1\n\t"5758 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5759 "vmovaps 64(%[HT]), %%xmm0\n\t"5760 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5761 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5762 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5763 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5764 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5765 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5766 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5767 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5768 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5769 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5770 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5771 "vpslldq $8, %%xmm13, %%xmm14\n\t"5772 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5773 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5774 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5775 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t"5776 "vpxor %%xmm14, 
%%xmm2, %%xmm2\n\t"5777 "vpxor %%xmm13, %%xmm3, %%xmm3\n\t"5778 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5779 5780 "vmovaps 80(%[KEY]), %%xmm12\n\t"5781 "vmovdqu 64(%[in]), %%xmm1\n\t"5782 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5783 "vmovaps 48(%[HT]), %%xmm0\n\t"5784 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5785 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5786 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5787 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5788 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5789 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5790 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5791 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5792 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5793 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5794 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5795 "vpslldq $8, %%xmm13, %%xmm14\n\t"5796 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5797 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5798 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5799 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t"5800 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t"5801 "vpxor %%xmm13, %%xmm3, %%xmm3\n\t"5802 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5803 5804 "vmovaps 96(%[KEY]), %%xmm12\n\t"5805 "vmovdqu 80(%[in]), %%xmm1\n\t"5806 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5807 "vmovaps 32(%[HT]), %%xmm0\n\t"5808 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5809 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5810 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5811 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5812 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5813 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5814 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5815 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5816 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5817 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5818 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5819 "vpslldq $8, %%xmm13, %%xmm14\n\t"5820 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5821 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5822 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5823 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t"5824 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t"5825 "vpxor %%xmm13, 
%%xmm3, %%xmm3\n\t"5826 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5827 5828 "vmovaps 112(%[KEY]), %%xmm12\n\t"5829 "vmovdqu 96(%[in]), %%xmm1\n\t"5830 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5831 "vmovaps 16(%[HT]), %%xmm0\n\t"5832 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5833 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5834 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5835 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5836 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5837 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5838 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5839 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5840 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5841 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5842 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5843 "vpslldq $8, %%xmm13, %%xmm14\n\t"5844 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5845 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5846 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5847 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t"5848 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t"5849 "vpxor %%xmm13, %%xmm3, %%xmm3\n\t"5850 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5851 5852 "vmovaps 128(%[KEY]), %%xmm12\n\t"5853 "vmovdqu 112(%[in]), %%xmm1\n\t"5854 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5855 "vmovaps (%[HT]), %%xmm0\n\t"5856 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5857 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5858 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5859 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5860 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5861 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5862 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"5863 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5864 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"5865 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5866 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5867 "vpslldq $8, %%xmm13, %%xmm14\n\t"5868 "vpsrldq $8, %%xmm13, %%xmm13\n\t"5869 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5870 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"5871 "vpxor %%xmm1, %%xmm3, %%xmm3\n\t"5872 "vpxor %%xmm14, %%xmm2, %%xmm2\n\t"5873 "vpxor %%xmm13, %%xmm3, %%xmm3\n\t"5874 "vaesenc 
%%xmm12, %%xmm11, %%xmm11\n\t"5875 5876 "vmovaps 144(%[KEY]), %%xmm12\n\t"5877 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5878 "vmovdqa %[MOD2_128], %%xmm0\n\t"5879 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5880 "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t"5881 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5882 "vpshufd $78, %%xmm2, %%xmm13\n\t"5883 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5884 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5885 "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t"5886 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5887 "vpshufd $78, %%xmm13, %%xmm13\n\t"5888 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"5889 "vpxor %%xmm3, %%xmm13, %%xmm13\n\t"5890 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5891 "vmovdqa %%xmm13, %%xmm2\n\t"5892 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5893 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5894 "cmpl $11, %[nr]\n\t"5895 "vmovaps 160(%[KEY]), %%xmm12\n\t"5896 "jl %=f\n\t"5897 5898 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5899 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5900 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5901 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5902 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5903 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5904 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5905 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5906 "vmovaps 176(%[KEY]), %%xmm12\n\t"5907 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5908 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5909 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5910 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5911 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5912 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5913 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5914 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5915 "cmpl $13, %[nr]\n\t"5916 "vmovaps 192(%[KEY]), %%xmm12\n\t"5917 "jl %=f\n\t"5918 5919 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5920 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5921 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5922 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5923 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5924 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5925 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5926 "vaesenc 
%%xmm12, %%xmm11, %%xmm11\n\t"5927 "vmovaps 208(%[KEY]), %%xmm12\n\t"5928 "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t"5929 "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t"5930 "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t"5931 "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t"5932 "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t"5933 "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t"5934 "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t"5935 "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t"5936 "vmovaps 224(%[KEY]), %%xmm12\n\t"5937 5938 "%=:\n\t"5939 "vaesenclast %%xmm12, %%xmm4, %%xmm4\n\t"5940 "vaesenclast %%xmm12, %%xmm5, %%xmm5\n\t"5941 "vpxor (%[in]), %%xmm4, %%xmm4\n\t"5942 "vpxor 16(%[in]), %%xmm5, %%xmm5\n\t"5943 "vmovdqu %%xmm4, (%[out])\n\t"5944 "vmovdqu %%xmm5, 16(%[out])\n\t"5945 "vaesenclast %%xmm12, %%xmm6, %%xmm6\n\t"5946 "vaesenclast %%xmm12, %%xmm7, %%xmm7\n\t"5947 "vpxor 32(%[in]), %%xmm6, %%xmm6\n\t"5948 "vpxor 48(%[in]), %%xmm7, %%xmm7\n\t"5949 "vmovdqu %%xmm6, 32(%[out])\n\t"5950 "vmovdqu %%xmm7, 48(%[out])\n\t"5951 "vaesenclast %%xmm12, %%xmm8, %%xmm8\n\t"5952 "vaesenclast %%xmm12, %%xmm9, %%xmm9\n\t"5953 "vpxor 64(%[in]), %%xmm8, %%xmm8\n\t"5954 "vpxor 80(%[in]), %%xmm9, %%xmm9\n\t"5955 "vmovdqu %%xmm8, 64(%[out])\n\t"5956 "vmovdqu %%xmm9, 80(%[out])\n\t"5957 "vaesenclast %%xmm12, %%xmm10, %%xmm10\n\t"5958 "vaesenclast %%xmm12, %%xmm11, %%xmm11\n\t"5959 "vpxor 96(%[in]), %%xmm10, %%xmm10\n\t"5960 "vpxor 112(%[in]), %%xmm11, %%xmm11\n\t"5961 "vmovdqu %%xmm10, 96(%[out])\n\t"5962 "vmovdqu %%xmm11, 112(%[out])\n\t"5963 5964 : [XV] "+xr" (XV)5965 : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1),5966 [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr),5967 [BSWAP_MASK] "m" (BSWAP_MASK),5968 [BSWAP_EPI64] "m" (BSWAP_EPI64),5969 [ONE] "m" (ONE), [TWO] "m" (TWO),5970 [THREE] "m" (THREE), [FOUR] "m" (FOUR),5971 [FIVE] "m" (FIVE), [SIX] "m" (SIX),5972 [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT),5973 [MOD2_128] "m" (MOD2_128)5974 : "xmm15", "xmm14", "xmm13", "xmm12",5975 "xmm11", "xmm10", "xmm9", "xmm8",5976 "xmm7", 
"xmm6", "xmm5", "xmm4",5977 "xmm0", "xmm1", "xmm3", "memory"5978 );5979 }5980 X = XV;5981 ctr1 = pctr1[0];5982 }5983 #endif5984 for (k = i*8; k < nbytes/16; k++) {5985 __asm__ __volatile__ (5986 "vpshufb %[BSWAP_EPI64], %[ctr1], %%xmm4\n\t"5987 "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t"5988 "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t"5989 "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t"5990 "vmovaps %[H], %%xmm0\n\t"5991 "vmovdqu (%[in]), %%xmm1\n\t"5992 "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t"5993 "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t"5994 "vpxor %[X], %%xmm1, %%xmm1\n\t"5995 "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t"5996 "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t"5997 "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t"5998 "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t"5999 "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t"6000 "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t"6001 "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t"6002 "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t"6003 "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t"6004 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"6005 "vpslldq $8, %%xmm13, %%xmm2\n\t"6006 "vpsrldq $8, %%xmm13, %%xmm13\n\t"6007 "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t"6008 "vpxor %%xmm15, %%xmm2, %%xmm2\n\t"6009 "vpxor %%xmm13, %%xmm1, %%xmm3\n\t"6010 "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t"6011 "# Reduce\n\t"6012 "vmovdqa %[MOD2_128], %%xmm0\n\t"6013 "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t"6014 "vpshufd $78, %%xmm2, %%xmm13\n\t"6015 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"6016 "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t"6017 "vpshufd $78, %%xmm13, %%xmm13\n\t"6018 "vpxor %%xmm14, %%xmm13, %%xmm13\n\t"6019 "vpxor %%xmm3, %%xmm13, %%xmm13\n\t"6020 "vmovdqa %%xmm13, %[X]\n\t"6021 "# End Reduce\n\t"6022 "cmpl $11, %[nr]\n\t"6023 "vmovaps 160(%[KEY]), %%xmm5\n\t"6024 "jl %=f\n\t"6025 "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t"6026 "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t"6027 "cmpl $13, %[nr]\n\t"6028 "vmovaps 192(%[KEY]), %%xmm5\n\t"6029 "jl %=f\n\t"6030 "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t"6031 "vaesenc 
208(%[KEY]), %%xmm4, %%xmm4\n\t"6032 "vmovaps 224(%[KEY]), %%xmm5\n\t"6033 "%=:\n\t"6034 "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t"6035 "vpxor (%[in]), %%xmm4, %%xmm4\n\t"6036 "vmovdqu %%xmm4, (%[out])\n\t"6037 6038 : [H] "+xr" (H), [X] "+xr" (X),6039 [ctr1] "+xr" (ctr1)6040 : [KEY] "r" (KEY),6041 [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr),6042 [BSWAP_MASK] "m" (BSWAP_MASK),6043 [BSWAP_EPI64] "m" (BSWAP_EPI64),6044 [ONE] "m" (ONE),6045 [MOD2_128] "m" (MOD2_128)6046 : "xmm15", "xmm14", "xmm13", "xmm4", "xmm5",6047 "xmm0", "xmm1", "xmm2", "xmm3", "memory"6048 );6049 }6050 6051 /* If one partial block remains */6052 if (nbytes % 16) {6053 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);6054 tmp1 = _mm_xor_si128(tmp1, KEY[0]);6055 tmp1 = _mm_aesenc_si128(tmp1, KEY[1]);6056 tmp1 = _mm_aesenc_si128(tmp1, KEY[2]);6057 tmp1 = _mm_aesenc_si128(tmp1, KEY[3]);6058 tmp1 = _mm_aesenc_si128(tmp1, KEY[4]);6059 tmp1 = _mm_aesenc_si128(tmp1, KEY[5]);6060 tmp1 = _mm_aesenc_si128(tmp1, KEY[6]);6061 tmp1 = _mm_aesenc_si128(tmp1, KEY[7]);6062 tmp1 = _mm_aesenc_si128(tmp1, KEY[8]);6063 tmp1 = _mm_aesenc_si128(tmp1, KEY[9]);6064 lastKey = KEY[10];6065 if (nr > 10) {6066 tmp1 = _mm_aesenc_si128(tmp1, lastKey);6067 tmp1 = _mm_aesenc_si128(tmp1, KEY[11]);6068 lastKey = KEY[12];6069 if (nr > 12) {6070 tmp1 = _mm_aesenc_si128(tmp1, lastKey);6071 tmp1 = _mm_aesenc_si128(tmp1, KEY[13]);6072 lastKey = KEY[14];6073 }6074 }6075 tmp1 = _mm_aesenclast_si128(tmp1, lastKey);6076 last_block = _mm_setzero_si128();6077 for (j=0; j < nbytes%16; j++)6078 ((unsigned char*)&last_block)[j] = in[k*16+j];6079 XV = last_block;6080 tmp1 = _mm_xor_si128(tmp1, last_block);6081 last_block = tmp1;6082 for (j=0; j < nbytes%16; j++)6083 out[k*16+j] = ((unsigned char*)&last_block)[j];6084 XV = _mm_shuffle_epi8(XV, BSWAP_MASK);6085 XV = _mm_xor_si128(XV, X);6086 X = gfmul_shifted(XV, H);6087 }6088 6089 tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0);6090 tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1);6091 /* 128 x 128 
Carryless Multiply */6092 X = _mm_xor_si128(X, tmp1);6093 X = gfmul_shifted(X, H);6094 X = _mm_shuffle_epi8(X, BSWAP_MASK);6095 T = _mm_xor_si128(X, T);6096 6097 if (0xffff !=6098 _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag))))6099 return 0; /* in case the authentication failed */6100 6101 return 1; /* when successful returns 1 */6102 }6103 #endif /* HAVE_INTEL_AVX2 */6104 7631 #endif /* HAVE_AES_DECRYPT */ 7632 #endif /* _MSC_VER */ 6105 7633 #endif /* WOLFSSL_AESNI */ 6106 7634 … … 6483 8011 Z[2] ^= V[2]; 6484 8012 Z[3] ^= V[3]; 6485 8013 } 6486 8014 6487 8015 if (V[3] & 0x00000001) { … … 6502 8030 V[1] |= ((V[0] & 0x00000001) ? 0x80000000 : 0); 6503 8031 V[0] >>= 1; 6504 8032 } 6505 8033 y <<= 1; 6506 8034 } … … 6613 8141 6614 8142 #if !defined(WOLFSSL_XILINX_CRYPT) 8143 #ifdef FREESCALE_LTC_AES_GCM 6615 8144 int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, 6616 8145 const byte* iv, word32 ivSz, 6617 8146 byte* authTag, word32 authTagSz, 6618 8147 const byte* authIn, word32 authInSz) 8148 { 8149 status_t status; 8150 word32 keySize; 8151 8152 /* argument checks */ 8153 if (aes == NULL || authTagSz > AES_BLOCK_SIZE) { 8154 return BAD_FUNC_ARG; 8155 } 8156 8157 if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) { 8158 WOLFSSL_MSG("GcmEncrypt authTagSz too small error"); 8159 return BAD_FUNC_ARG; 8160 } 8161 8162 status = wc_AesGetKeySize(aes, &keySize); 8163 if (status) 8164 return status; 8165 8166 status = LTC_AES_EncryptTagGcm(LTC_BASE, in, out, sz, iv, ivSz, 8167 authIn, authInSz, (byte*)aes->key, keySize, authTag, authTagSz); 8168 8169 return (status == kStatus_Success) ? 
0 : AES_GCM_AUTH_E; 8170 } 8171 #else 8172 #if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \ 8173 defined(WOLFSSL_STM32F7) || \ 8174 defined(WOLFSSL_STM32L4)) 8175 8176 static WC_INLINE int wc_AesGcmEncrypt_STM32(Aes* aes, byte* out, const byte* in, 8177 word32 sz, const byte* iv, word32 ivSz, 8178 byte* authTag, word32 authTagSz, 8179 const byte* authIn, word32 authInSz) 8180 { 8181 int ret; 8182 #ifdef WOLFSSL_STM32_CUBEMX 8183 CRYP_HandleTypeDef hcryp; 8184 #else 8185 word32 keyCopy[AES_256_KEY_SIZE/sizeof(word32)]; 8186 #endif 8187 word32 keySize; 8188 int status = 0; 8189 int outPadSz, authPadSz; 8190 word32 tag[AES_BLOCK_SIZE/sizeof(word32)]; 8191 word32 initialCounter[AES_BLOCK_SIZE/sizeof(word32)]; 8192 byte* outPadded = NULL; 8193 byte* authInPadded = NULL; 8194 8195 ret = wc_AesGetKeySize(aes, &keySize); 8196 if (ret != 0) 8197 return ret; 8198 8199 #ifdef WOLFSSL_STM32_CUBEMX 8200 ret = wc_Stm32_Aes_Init(aes, &hcryp); 8201 if (ret != 0) 8202 return ret; 8203 #endif 8204 8205 XMEMSET(initialCounter, 0, sizeof(initialCounter)); 8206 XMEMCPY(initialCounter, iv, ivSz); 8207 *((byte*)initialCounter + (AES_BLOCK_SIZE - 1)) = STM32_GCM_IV_START; 8208 8209 /* Need to pad the AAD and input cipher text to a full block size since 8210 * CRYP_AES_GCM will assume these are a multiple of AES_BLOCK_SIZE. 8211 * It is okay to pad with zeros because GCM does this before GHASH already. 8212 * See NIST SP 800-38D */ 8213 if ((sz % AES_BLOCK_SIZE) != 0 || sz == 0) { 8214 outPadSz = ((sz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; 8215 outPadded = (byte*)XMALLOC(outPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8216 if (outPadded == NULL) { 8217 return MEMORY_E; 8218 } 8219 XMEMSET(outPadded, 0, outPadSz); 8220 } 8221 else { 8222 outPadSz = sz; 8223 outPadded = out; 8224 } 8225 XMEMCPY(outPadded, in, sz); 8226 8227 if (authInSz == 0 || (authInSz % AES_BLOCK_SIZE) != 0) { 8228 /* Need to pad the AAD to a full block with zeros. 
*/ 8229 authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; 8230 authInPadded = (byte*)XMALLOC(authPadSz, aes->heap, 8231 DYNAMIC_TYPE_TMP_BUFFER); 8232 if (authInPadded == NULL) { 8233 if (outPadded != out) { 8234 XFREE(outPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8235 } 8236 return MEMORY_E; 8237 } 8238 XMEMSET(authInPadded, 0, authPadSz); 8239 XMEMCPY(authInPadded, authIn, authInSz); 8240 } else { 8241 authPadSz = authInSz; 8242 authInPadded = (byte*)authIn; 8243 } 8244 8245 8246 #ifdef WOLFSSL_STM32_CUBEMX 8247 hcryp.Init.pInitVect = (uint8_t*)initialCounter; 8248 hcryp.Init.Header = authInPadded; 8249 hcryp.Init.HeaderSize = authInSz; 8250 8251 #ifdef STM32_CRYPTO_AES_ONLY 8252 /* Set the CRYP parameters */ 8253 hcryp.Init.ChainingMode = CRYP_CHAINMODE_AES_GCM_GMAC; 8254 hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT; 8255 hcryp.Init.GCMCMACPhase = CRYP_INIT_PHASE; 8256 HAL_CRYP_Init(&hcryp); 8257 8258 /* GCM init phase */ 8259 status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, 0, NULL, STM32_HAL_TIMEOUT); 8260 if (status == HAL_OK) { 8261 /* GCM header phase */ 8262 hcryp.Init.GCMCMACPhase = CRYP_HEADER_PHASE; 8263 status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, 0, NULL, STM32_HAL_TIMEOUT); 8264 if (status == HAL_OK) { 8265 /* GCM payload phase */ 8266 hcryp.Init.GCMCMACPhase = CRYP_PAYLOAD_PHASE; 8267 status = HAL_CRYPEx_AES_Auth(&hcryp, outPadded, sz, outPadded, 8268 STM32_HAL_TIMEOUT); 8269 if (status == HAL_OK) { 8270 /* GCM final phase */ 8271 hcryp.Init.GCMCMACPhase = CRYP_FINAL_PHASE; 8272 status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, sz, (byte*)tag, 8273 STM32_HAL_TIMEOUT); 8274 } 8275 } 8276 } 8277 #else 8278 HAL_CRYP_Init(&hcryp); 8279 8280 status = HAL_CRYPEx_AESGCM_Encrypt(&hcryp, outPadded, sz, 8281 outPadded, STM32_HAL_TIMEOUT); 8282 /* Compute the authTag */ 8283 if (status == HAL_OK) { 8284 status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, (byte*)tag, 8285 STM32_HAL_TIMEOUT); 8286 } 8287 #endif 8288 8289 if (status != HAL_OK) 8290 ret = 
AES_GCM_AUTH_E; 8291 HAL_CRYP_DeInit(&hcryp); 8292 8293 #else /* STD_PERI_LIB */ 8294 ByteReverseWords(keyCopy, (word32*)aes->key, keySize); 8295 status = CRYP_AES_GCM(MODE_ENCRYPT, (uint8_t*)initialCounter, 8296 (uint8_t*)keyCopy, keySize * 8, 8297 (uint8_t*)outPadded, sz, 8298 (uint8_t*)authInPadded,authInSz, 8299 (uint8_t*)outPadded, (byte*)tag); 8300 if (status != SUCCESS) 8301 ret = AES_GCM_AUTH_E; 8302 #endif /* WOLFSSL_STM32_CUBEMX */ 8303 8304 if (ret == 0) { 8305 /* return authTag */ 8306 XMEMCPY(authTag, tag, authTagSz); 8307 8308 /* return output if allocated padded used */ 8309 if (outPadded != out) { 8310 XMEMCPY(out, outPadded, sz); 8311 } 8312 } 8313 8314 /* Free memory if not a multiple of AES_BLOCK_SZ */ 8315 if (outPadded != out) { 8316 XFREE(outPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8317 } 8318 if (authInPadded != authIn) { 8319 XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8320 } 8321 8322 return ret; 8323 } 8324 #endif /* STM32_CRYPTO */ 8325 8326 #ifdef WOLFSSL_AESNI 8327 int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, 8328 const byte* iv, word32 ivSz, 8329 byte* authTag, word32 authTagSz, 8330 const byte* authIn, word32 authInSz); 8331 #else 8332 static 8333 #endif 8334 int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, 8335 const byte* iv, word32 ivSz, 8336 byte* authTag, word32 authTagSz, 8337 const byte* authIn, word32 authInSz) 6619 8338 { 6620 8339 int ret = 0; 6621 word32 keySize;6622 #ifdef FREESCALE_LTC_AES_GCM6623 status_t status;6624 #else6625 8340 word32 blocks = sz / AES_BLOCK_SIZE; 6626 8341 word32 partial = sz % AES_BLOCK_SIZE; … … 6631 8346 byte *ctr; 6632 8347 byte scratch[AES_BLOCK_SIZE]; 6633 #if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7)) 6634 #ifdef WOLFSSL_STM32_CUBEMX 6635 CRYP_HandleTypeDef hcryp; 6636 #else 6637 byte keyCopy[AES_BLOCK_SIZE * 2]; 6638 #endif /* WOLFSSL_STM32_CUBEMX */ 6639 int status = 0; 6640 byte* 
authInPadded = NULL; 6641 byte tag[AES_BLOCK_SIZE]; 6642 int authPadSz; 6643 #endif /* STM32_CRYPTO */ 6644 #endif /* FREESCALE_LTC_AES_GCM */ 6645 8348 8349 ctr = counter; 8350 XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); 8351 if (ivSz == GCM_NONCE_MID_SZ) { 8352 XMEMCPY(initialCounter, iv, ivSz); 8353 initialCounter[AES_BLOCK_SIZE - 1] = 1; 8354 } 8355 else { 8356 GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); 8357 } 8358 XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); 8359 8360 #ifdef WOLFSSL_PIC32MZ_CRYPT 8361 if (blocks) { 8362 /* use initial IV for PIC32 HW, but don't use it below */ 8363 XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE); 8364 8365 ret = wc_Pic32AesCrypt( 8366 aes->key, aes->keylen, aes->reg, AES_BLOCK_SIZE, 8367 out, in, (blocks * AES_BLOCK_SIZE), 8368 PIC32_ENCRYPTION, PIC32_ALGO_AES, PIC32_CRYPTOALGO_AES_GCM); 8369 if (ret != 0) 8370 return ret; 8371 } 8372 /* process remainder using partial handling */ 8373 #endif 8374 8375 #if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT) 8376 /* some hardware acceleration can gain performance from doing AES encryption 8377 * of the whole buffer at once */ 8378 if (c != p) { /* can not handle inline encryption */ 8379 while (blocks--) { 8380 IncrementGcmCounter(ctr); 8381 XMEMCPY(c, ctr, AES_BLOCK_SIZE); 8382 c += AES_BLOCK_SIZE; 8383 } 8384 8385 /* reset number of blocks and then do encryption */ 8386 blocks = sz / AES_BLOCK_SIZE; 8387 wc_AesEcbEncrypt(aes, out, out, AES_BLOCK_SIZE * blocks); 8388 xorbuf(out, p, AES_BLOCK_SIZE * blocks); 8389 p += AES_BLOCK_SIZE * blocks; 8390 } 8391 else 8392 #endif /* HAVE_AES_ECB */ 8393 8394 while (blocks--) { 8395 IncrementGcmCounter(ctr); 8396 #ifndef WOLFSSL_PIC32MZ_CRYPT 8397 wc_AesEncrypt(aes, ctr, scratch); 8398 xorbuf(scratch, p, AES_BLOCK_SIZE); 8399 XMEMCPY(c, scratch, AES_BLOCK_SIZE); 8400 #endif 8401 p += AES_BLOCK_SIZE; 8402 c += AES_BLOCK_SIZE; 8403 } 8404 8405 if (partial != 0) { 8406 IncrementGcmCounter(ctr); 8407 wc_AesEncrypt(aes, ctr, 
scratch); 8408 xorbuf(scratch, p, partial); 8409 XMEMCPY(c, scratch, partial); 8410 } 8411 8412 GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz); 8413 wc_AesEncrypt(aes, initialCounter, scratch); 8414 xorbuf(authTag, scratch, authTagSz); 8415 8416 return ret; 8417 } 8418 8419 /* Software AES - GCM Encrypt */ 8420 int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, 8421 const byte* iv, word32 ivSz, 8422 byte* authTag, word32 authTagSz, 8423 const byte* authIn, word32 authInSz) 8424 { 6646 8425 /* argument checks */ 6647 8426 if (aes == NULL || authTagSz > AES_BLOCK_SIZE) { 6648 8427 return BAD_FUNC_ARG; 6649 }8428 } 6650 8429 6651 8430 if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) { 6652 8431 WOLFSSL_MSG("GcmEncrypt authTagSz too small error"); 6653 8432 return BAD_FUNC_ARG; 6654 } 6655 6656 ret = wc_AesGetKeySize(aes, &keySize); 6657 if (ret != 0) 6658 return ret; 6659 6660 #ifdef FREESCALE_LTC_AES_GCM 6661 6662 status = LTC_AES_EncryptTagGcm(LTC_BASE, in, out, sz, iv, ivSz, 6663 authIn, authInSz, (byte*)aes->key, keySize, authTag, authTagSz); 6664 6665 ret = (status == kStatus_Success) ? 0 : AES_GCM_AUTH_E; 6666 6667 #else 6668 6669 #if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7)) 6670 6671 /* additional argument checks - STM32 HW only supports 12 byte IV */ 6672 if (ivSz != NONCE_SZ) { 6673 return BAD_FUNC_ARG; 6674 } 6675 6676 XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); 6677 XMEMCPY(initialCounter, iv, ivSz); 6678 initialCounter[AES_BLOCK_SIZE - 1] = STM32_GCM_IV_START; 6679 6680 /* STM32 HW AES-GCM requires / assumes inputs are a multiple of block size. 6681 * We can avoid this by zero padding (authIn) AAD, but zero-padded plaintext 6682 * will be encrypted and output incorrectly, causing a bad authTag. 6683 * We will use HW accelerated AES-GCM if plain%AES_BLOCK_SZ==0. 6684 * Otherwise, we will use accelerated AES_CTR for encrypt, and then 6685 * perform GHASH in software. 
6686 * See NIST SP 800-38D */ 6687 6688 /* Plain text is a multiple of block size, so use HW-Accelerated AES_GCM */ 6689 if (!partial) { 6690 /* pad authIn if it is not a block multiple */ 6691 if ((authInSz % AES_BLOCK_SIZE) != 0) { 6692 authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; 6693 /* Need to pad the AAD to a full block with zeros. */ 6694 authInPadded = XMALLOC(authPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 6695 if (authInPadded == NULL) { 6696 return MEMORY_E; 6697 } 6698 XMEMSET(authInPadded, 0, authPadSz); 6699 XMEMCPY(authInPadded, authIn, authInSz); 6700 } else { 6701 authPadSz = authInSz; 6702 authInPadded = (byte*)authIn; 6703 } 6704 6705 6706 #ifdef WOLFSSL_STM32_CUBEMX 6707 XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); 6708 switch (keySize) { 6709 case 16: /* 128-bit key */ 6710 hcryp.Init.KeySize = CRYP_KEYSIZE_128B; 6711 break; 6712 case 24: /* 192-bit key */ 6713 hcryp.Init.KeySize = CRYP_KEYSIZE_192B; 6714 break; 6715 case 32: /* 256-bit key */ 6716 hcryp.Init.KeySize = CRYP_KEYSIZE_256B; 6717 break; 6718 default: 6719 break; 6720 } 6721 hcryp.Instance = CRYP; 6722 hcryp.Init.DataType = CRYP_DATATYPE_8B; 6723 hcryp.Init.pKey = (byte*)aes->key; 6724 hcryp.Init.pInitVect = initialCounter; 6725 hcryp.Init.Header = authInPadded; 6726 hcryp.Init.HeaderSize = authInSz; 6727 6728 HAL_CRYP_Init(&hcryp); 6729 status = HAL_CRYPEx_AESGCM_Encrypt(&hcryp, (byte*)in, sz, 6730 out, STM32_HAL_TIMEOUT); 6731 /* Compute the authTag */ 6732 if (status == HAL_OK) 6733 status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, tag, STM32_HAL_TIMEOUT); 6734 6735 if (status != HAL_OK) 6736 ret = AES_GCM_AUTH_E; 6737 HAL_CRYP_DeInit(&hcryp); 6738 #else 6739 ByteReverseWords((word32*)keyCopy, (word32*)aes->key, keySize); 6740 status = CRYP_AES_GCM(MODE_ENCRYPT, (uint8_t*)initialCounter, 6741 (uint8_t*)keyCopy, keySize * 8, 6742 (uint8_t*)in, sz, 6743 (uint8_t*)authInPadded,authInSz, 6744 (uint8_t*)out, tag); 6745 if (status != SUCCESS) 6746 ret = 
AES_GCM_AUTH_E; 6747 #endif /* WOLFSSL_STM32_CUBEMX */ 6748 6749 /* authTag may be shorter than AES_BLOCK_SZ, store separately */ 6750 if (ret == 0) 6751 XMEMCPY(authTag, tag, authTagSz); 6752 6753 /* We only allocate extra memory if authInPadded is not a multiple of AES_BLOCK_SZ */ 6754 if (authInPadded != NULL && authInSz != authPadSz) { 6755 XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 6756 } 6757 6758 return ret; 6759 } 6760 8433 } 8434 8435 #ifdef WOLF_CRYPTO_DEV 8436 if (aes->devId != INVALID_DEVID) { 8437 int ret = wc_CryptoDev_AesGcmEncrypt(aes, out, in, sz, iv, ivSz, 8438 authTag, authTagSz, authIn, authInSz); 8439 if (ret != NOT_COMPILED_IN) 8440 return ret; 8441 ret = 0; /* reset error code and try using software */ 8442 } 6761 8443 #endif 6762 8444 6763 /* Software AES-GCM */ 8445 #if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \ 8446 defined(WOLFSSL_STM32F7) || \ 8447 defined(WOLFSSL_STM32L4)) 8448 8449 /* STM32 HW only supports 12 byte IV and 16 byte auth */ 8450 if (ivSz == GCM_NONCE_MID_SZ && authInSz == AES_BLOCK_SIZE) { 8451 return wc_AesGcmEncrypt_STM32(aes, out, in, sz, iv, ivSz, 8452 authTag, authTagSz, authIn, authInSz); 8453 } 8454 #endif 6764 8455 6765 8456 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) 6766 8457 /* if async and byte count above threshold */ 8458 /* only 12-byte IV is supported in HW */ 6767 8459 if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES && 6768 sz >= WC_ASYNC_THRESH_AES_GCM) {8460 sz >= WC_ASYNC_THRESH_AES_GCM && ivSz == GCM_NONCE_MID_SZ) { 6769 8461 #if defined(HAVE_CAVIUM) 6770 /* Not yet supported, contact wolfSSL if interested in using */ 8462 #ifdef HAVE_CAVIUM_V 8463 if (authInSz == 20) { /* Nitrox V GCM is only working with 20 byte AAD */ 8464 return NitroxAesGcmEncrypt(aes, out, in, sz, 8465 (const byte*)aes->asyncKey, aes->keylen, iv, ivSz, 8466 authTag, authTagSz, authIn, authInSz); 8467 } 8468 #endif 6771 8469 #elif defined(HAVE_INTEL_QA) 6772 8470 return 
IntelQaSymAesGcmEncrypt(&aes->asyncDev, out, in, sz, … … 6788 8486 return WC_PENDING_E; 6789 8487 } 8488 #endif 8489 } 8490 #endif /* WOLFSSL_ASYNC_CRYPT */ 8491 8492 #ifdef WOLFSSL_AESNI 8493 #ifdef HAVE_INTEL_AVX2 8494 if (IS_INTEL_AVX2(intel_flags)) { 8495 AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, 8496 authTagSz, (const byte*)aes->key, aes->rounds); 8497 return 0; 8498 } 8499 else 8500 #endif 8501 #ifdef HAVE_INTEL_AVX1 8502 if (IS_INTEL_AVX1(intel_flags)) { 8503 AES_GCM_encrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz, 8504 authTagSz, (const byte*)aes->key, aes->rounds); 8505 return 0; 8506 } 8507 else 6790 8508 #endif 6791 }6792 #endif /* WOLFSSL_ASYNC_CRYPT */6793 6794 #ifdef WOLFSSL_AESNI6795 8509 if (haveAESNI) { 6796 #ifdef HAVE_INTEL_AVX2 6797 if (IS_INTEL_AVX2(intel_flags)) { 6798 AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, 6799 sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); 6800 } 6801 else 6802 #endif 6803 AES_GCM_encrypt(in, out, authIn, iv, authTag, 6804 sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); 8510 AES_GCM_encrypt(in, out, authIn, iv, authTag, sz, authInSz, ivSz, 8511 authTagSz, (const byte*)aes->key, aes->rounds); 6805 8512 return 0; 6806 8513 } 8514 else 6807 8515 #endif 6808 6809 ctr = counter; 6810 XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); 6811 if (ivSz == NONCE_SZ) { 6812 XMEMCPY(initialCounter, iv, ivSz); 6813 initialCounter[AES_BLOCK_SIZE - 1] = 1; 6814 } 6815 else { 6816 GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); 6817 } 6818 XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); 6819 6820 #ifdef WOLFSSL_PIC32MZ_CRYPT 6821 if (blocks) { 6822 /* use intitial IV for PIC32 HW, but don't use it below */ 6823 XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE); 6824 6825 ret = wc_Pic32AesCrypt( 6826 aes->key, aes->keylen, aes->reg, AES_BLOCK_SIZE, 6827 out, in, (blocks * AES_BLOCK_SIZE), 6828 PIC32_ENCRYPTION, PIC32_ALGO_AES, PIC32_CRYPTOALGO_AES_GCM); 6829 if (ret != 0) 
6830 return ret; 6831 } 6832 /* process remainder using partial handling */ 8516 { 8517 return AES_GCM_encrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, 8518 authIn, authInSz); 8519 } 8520 } 6833 8521 #endif 6834 while (blocks--) { 6835 IncrementGcmCounter(ctr); 6836 #ifndef WOLFSSL_PIC32MZ_CRYPT 6837 wc_AesEncrypt(aes, ctr, scratch); 6838 xorbuf(scratch, p, AES_BLOCK_SIZE); 6839 XMEMCPY(c, scratch, AES_BLOCK_SIZE); 6840 #endif 6841 p += AES_BLOCK_SIZE; 6842 c += AES_BLOCK_SIZE; 6843 } 6844 6845 if (partial != 0) { 6846 IncrementGcmCounter(ctr); 6847 wc_AesEncrypt(aes, ctr, scratch); 6848 xorbuf(scratch, p, partial); 6849 XMEMCPY(c, scratch, partial); 6850 6851 } 6852 6853 GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz); 6854 wc_AesEncrypt(aes, initialCounter, scratch); 6855 xorbuf(authTag, scratch, authTagSz); 6856 6857 #endif /* FREESCALE_LTC_AES_GCM */ 6858 6859 return ret; 6860 } 6861 6862 8522 8523 8524 8525 /* AES GCM Decrypt */ 6863 8526 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AESGCM_DECRYPT) 8527 #ifdef FREESCALE_LTC_AES_GCM 6864 8528 int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, 6865 8529 const byte* iv, word32 ivSz, … … 6867 8531 const byte* authIn, word32 authInSz) 6868 8532 { 8533 int ret; 8534 word32 keySize; 8535 status_t status; 8536 8537 /* argument checks */ 8538 /* If the sz is non-zero, both in and out must be set. If sz is 0, 8539 * in and out are don't cares, as this is is the GMAC case. */ 8540 if (aes == NULL || iv == NULL || (sz != 0 && (in == NULL || out == NULL)) || 8541 authTag == NULL || authTagSz > AES_BLOCK_SIZE || authTagSz == 0) { 8542 8543 return BAD_FUNC_ARG; 8544 } 8545 8546 ret = wc_AesGetKeySize(aes, &keySize); 8547 if (ret != 0) { 8548 return ret; 8549 } 8550 8551 status = LTC_AES_DecryptTagGcm(LTC_BASE, in, out, sz, iv, ivSz, 8552 authIn, authInSz, (byte*)aes->key, keySize, authTag, authTagSz); 8553 8554 return (status == kStatus_Success) ? 
0 : AES_GCM_AUTH_E; 8555 } 8556 8557 #else 8558 8559 #if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \ 8560 defined(WOLFSSL_STM32F7) || \ 8561 defined(WOLFSSL_STM32L4)) 8562 static WC_INLINE int wc_AesGcmDecrypt_STM32(Aes* aes, byte* out, 8563 const byte* in, word32 sz, 8564 const byte* iv, word32 ivSz, 8565 const byte* authTag, word32 authTagSz, 8566 const byte* authIn, word32 authInSz) 8567 { 8568 int ret; 8569 #ifdef WOLFSSL_STM32_CUBEMX 8570 CRYP_HandleTypeDef hcryp; 8571 #else 8572 word32 keyCopy[AES_256_KEY_SIZE/sizeof(word32)]; 8573 #endif 8574 word32 keySize; 8575 int status; 8576 int outPadSz, authPadSz; 8577 word32 tag[AES_BLOCK_SIZE/sizeof(word32)]; 8578 word32 initialCounter[AES_BLOCK_SIZE/sizeof(word32)]; 8579 byte* outPadded = NULL; 8580 byte* authInPadded = NULL; 8581 8582 ret = wc_AesGetKeySize(aes, &keySize); 8583 if (ret != 0) 8584 return ret; 8585 8586 #ifdef WOLFSSL_STM32_CUBEMX 8587 ret = wc_Stm32_Aes_Init(aes, &hcryp); 8588 if (ret != 0) 8589 return ret; 8590 #endif 8591 8592 XMEMSET(initialCounter, 0, sizeof(initialCounter)); 8593 XMEMCPY(initialCounter, iv, ivSz); 8594 *((byte*)initialCounter + (AES_BLOCK_SIZE - 1)) = STM32_GCM_IV_START; 8595 8596 /* Need to pad the AAD and input cipher text to a full block size since 8597 * CRYP_AES_GCM will assume these are a multiple of AES_BLOCK_SIZE. 8598 * It is okay to pad with zeros because GCM does this before GHASH already. 8599 * See NIST SP 800-38D */ 8600 if ((sz % AES_BLOCK_SIZE) != 0 || sz == 0) { 8601 outPadSz = ((sz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; 8602 outPadded = (byte*)XMALLOC(outPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8603 if (outPadded == NULL) { 8604 return MEMORY_E; 8605 } 8606 XMEMSET(outPadded, 0, outPadSz); 8607 } 8608 else { 8609 outPadSz = sz; 8610 outPadded = out; 8611 } 8612 XMEMCPY(outPadded, in, sz); 8613 8614 if (authInSz == 0 || (authInSz % AES_BLOCK_SIZE) != 0) { 8615 /* Need to pad the AAD to a full block with zeros. 
*/ 8616 authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; 8617 authInPadded = (byte*)XMALLOC(authPadSz, aes->heap, 8618 DYNAMIC_TYPE_TMP_BUFFER); 8619 if (authInPadded == NULL) { 8620 if (outPadded != out) { 8621 XFREE(outPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8622 } 8623 return MEMORY_E; 8624 } 8625 XMEMSET(authInPadded, 0, authPadSz); 8626 XMEMCPY(authInPadded, authIn, authInSz); 8627 } else { 8628 authPadSz = authInSz; 8629 authInPadded = (byte*)authIn; 8630 } 8631 8632 #ifdef WOLFSSL_STM32_CUBEMX 8633 hcryp.Init.pInitVect = (uint8_t*)initialCounter; 8634 hcryp.Init.Header = authInPadded; 8635 hcryp.Init.HeaderSize = authInSz; 8636 8637 #ifdef STM32_CRYPTO_AES_ONLY 8638 /* Set the CRYP parameters */ 8639 hcryp.Init.ChainingMode = CRYP_CHAINMODE_AES_GCM_GMAC; 8640 hcryp.Init.OperatingMode = CRYP_ALGOMODE_DECRYPT; 8641 hcryp.Init.GCMCMACPhase = CRYP_INIT_PHASE; 8642 HAL_CRYP_Init(&hcryp); 8643 8644 /* GCM init phase */ 8645 status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, 0, NULL, STM32_HAL_TIMEOUT); 8646 if (status == HAL_OK) { 8647 /* GCM header phase */ 8648 hcryp.Init.GCMCMACPhase = CRYP_HEADER_PHASE; 8649 status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, 0, NULL, STM32_HAL_TIMEOUT); 8650 if (status == HAL_OK) { 8651 /* GCM payload phase */ 8652 hcryp.Init.GCMCMACPhase = CRYP_PAYLOAD_PHASE; 8653 status = HAL_CRYPEx_AES_Auth(&hcryp, outPadded, sz, outPadded, 8654 STM32_HAL_TIMEOUT); 8655 if (status == HAL_OK) { 8656 /* GCM final phase */ 8657 hcryp.Init.GCMCMACPhase = CRYP_FINAL_PHASE; 8658 status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, sz, (byte*)tag, 8659 STM32_HAL_TIMEOUT); 8660 } 8661 } 8662 } 8663 #else 8664 HAL_CRYP_Init(&hcryp); 8665 /* Use outPadded for output buffer instead of out so that we don't overflow 8666 * our size. 
*/ 8667 status = HAL_CRYPEx_AESGCM_Decrypt(&hcryp, outPadded, sz, outPadded, 8668 STM32_HAL_TIMEOUT); 8669 /* Compute the authTag */ 8670 if (status == HAL_OK) { 8671 status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, (byte*)tag, 8672 STM32_HAL_TIMEOUT); 8673 } 8674 #endif 8675 8676 if (status != HAL_OK) 8677 ret = AES_GCM_AUTH_E; 8678 8679 HAL_CRYP_DeInit(&hcryp); 8680 8681 #else /* STD_PERI_LIB */ 8682 ByteReverseWords(keyCopy, (word32*)aes->key, aes->keylen); 8683 8684 /* Input size and auth size need to be the actual sizes, even though 8685 * they are not block aligned, because this length (in bits) is used 8686 * in the final GHASH. Use outPadded for output buffer instead of 8687 * out so that we don't overflow our size. */ 8688 status = CRYP_AES_GCM(MODE_DECRYPT, (uint8_t*)initialCounter, 8689 (uint8_t*)keyCopy, keySize * 8, 8690 (uint8_t*)outPadded, sz, 8691 (uint8_t*)authInPadded,authInSz, 8692 (uint8_t*)outPadded, (byte*)tag); 8693 if (status != SUCCESS) 8694 ret = AES_GCM_AUTH_E; 8695 #endif /* WOLFSSL_STM32_CUBEMX */ 8696 8697 if (ConstantCompare(authTag, (byte*)tag, authTagSz) != 0) { 8698 ret = AES_GCM_AUTH_E; 8699 } 8700 8701 if (ret == 0) { 8702 /* return output if allocated padded used */ 8703 if (outPadded != out) { 8704 XMEMCPY(out, outPadded, sz); 8705 } 8706 } 8707 8708 /* Free memory if not a multiple of AES_BLOCK_SZ */ 8709 if (outPadded != out) { 8710 XFREE(outPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8711 } 8712 if (authInPadded != authIn) { 8713 XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 8714 } 8715 8716 return ret; 8717 } 8718 #endif /* STM32 */ 8719 8720 #ifdef WOLFSSL_AESNI 8721 int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, 8722 const byte* iv, word32 ivSz, 8723 const byte* authTag, word32 authTagSz, 8724 const byte* authIn, word32 authInSz); 8725 #else 8726 static 8727 #endif 8728 int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, 8729 const byte* iv, word32 ivSz, 8730 const byte* 
authTag, word32 authTagSz, 8731 const byte* authIn, word32 authInSz) 8732 { 6869 8733 int ret = 0; 6870 word32 keySize;6871 #ifdef FREESCALE_LTC_AES_GCM6872 status_t status;6873 #elif defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7))6874 #ifdef WOLFSSL_STM32_CUBEMX6875 CRYP_HandleTypeDef hcryp;6876 #else6877 byte keyCopy[AES_BLOCK_SIZE * 2];6878 #endif /* WOLFSSL_STM32_CUBEMX */6879 int status;6880 int inPadSz, authPadSz;6881 byte tag[AES_BLOCK_SIZE];6882 byte *inPadded = NULL;6883 byte *authInPadded = NULL;6884 byte initialCounter[AES_BLOCK_SIZE];6885 #else /* software AES-GCM */6886 8734 word32 blocks = sz / AES_BLOCK_SIZE; 6887 8735 word32 partial = sz % AES_BLOCK_SIZE; … … 6894 8742 byte Tprime[AES_BLOCK_SIZE]; 6895 8743 byte EKY0[AES_BLOCK_SIZE]; 8744 ctr = counter; 8745 8746 XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); 8747 if (ivSz == GCM_NONCE_MID_SZ) { 8748 XMEMCPY(initialCounter, iv, ivSz); 8749 initialCounter[AES_BLOCK_SIZE - 1] = 1; 8750 } 8751 else { 8752 GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); 8753 } 8754 XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); 8755 8756 /* Calc the authTag again using the received auth data and the cipher text */ 8757 GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime)); 8758 wc_AesEncrypt(aes, ctr, EKY0); 8759 xorbuf(Tprime, EKY0, sizeof(Tprime)); 8760 8761 if (ConstantCompare(authTag, Tprime, authTagSz) != 0) { 8762 return AES_GCM_AUTH_E; 8763 } 8764 8765 #ifdef WOLFSSL_PIC32MZ_CRYPT 8766 if (blocks) { 8767 /* use initial IV for PIC32 HW, but don't use it below */ 8768 XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE); 8769 8770 ret = wc_Pic32AesCrypt( 8771 aes->key, aes->keylen, aes->reg, AES_BLOCK_SIZE, 8772 out, in, (blocks * AES_BLOCK_SIZE), 8773 PIC32_DECRYPTION, PIC32_ALGO_AES, PIC32_CRYPTOALGO_AES_GCM); 8774 if (ret != 0) 8775 return ret; 8776 } 8777 /* process remainder using partial handling */ 6896 8778 #endif 6897 8779 8780 #if defined(HAVE_AES_ECB) && 
!defined(WOLFSSL_PIC32MZ_CRYPT) 8781 /* some hardware acceleration can gain performance from doing AES encryption 8782 * of the whole buffer at once */ 8783 if (c != p) { /* can not handle inline decryption */ 8784 while (blocks--) { 8785 IncrementGcmCounter(ctr); 8786 XMEMCPY(p, ctr, AES_BLOCK_SIZE); 8787 p += AES_BLOCK_SIZE; 8788 } 8789 8790 /* reset number of blocks and then do encryption */ 8791 blocks = sz / AES_BLOCK_SIZE; 8792 wc_AesEcbEncrypt(aes, out, out, AES_BLOCK_SIZE * blocks); 8793 xorbuf(out, c, AES_BLOCK_SIZE * blocks); 8794 c += AES_BLOCK_SIZE * blocks; 8795 } 8796 else 8797 #endif /* HAVE_AES_ECB */ 8798 while (blocks--) { 8799 IncrementGcmCounter(ctr); 8800 #ifndef WOLFSSL_PIC32MZ_CRYPT 8801 wc_AesEncrypt(aes, ctr, scratch); 8802 xorbuf(scratch, c, AES_BLOCK_SIZE); 8803 XMEMCPY(p, scratch, AES_BLOCK_SIZE); 8804 #endif 8805 p += AES_BLOCK_SIZE; 8806 c += AES_BLOCK_SIZE; 8807 } 8808 8809 if (partial != 0) { 8810 IncrementGcmCounter(ctr); 8811 wc_AesEncrypt(aes, ctr, scratch); 8812 xorbuf(scratch, c, partial); 8813 XMEMCPY(p, scratch, partial); 8814 } 8815 8816 return ret; 8817 } 8818 8819 /* Software AES - GCM Decrypt */ 8820 int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, 8821 const byte* iv, word32 ivSz, 8822 const byte* authTag, word32 authTagSz, 8823 const byte* authIn, word32 authInSz) 8824 { 8825 #ifdef WOLFSSL_AESNI 8826 int res = AES_GCM_AUTH_E; 8827 #endif 8828 6898 8829 /* argument checks */ 6899 if (aes == NULL || out == NULL || in == NULL || iv == NULL || 6900 authTag == NULL || authTagSz > AES_BLOCK_SIZE) { 8830 /* If the sz is non-zero, both in and out must be set. If sz is 0, 8831 * in and out are don't cares, as this is is the GMAC case. 
*/ 8832 if (aes == NULL || iv == NULL || (sz != 0 && (in == NULL || out == NULL)) || 8833 authTag == NULL || authTagSz > AES_BLOCK_SIZE || authTagSz == 0) { 8834 6901 8835 return BAD_FUNC_ARG; 6902 8836 } 6903 8837 6904 ret = wc_AesGetKeySize(aes, &keySize); 6905 if (ret != 0) { 6906 return ret; 6907 } 6908 6909 #ifdef FREESCALE_LTC_AES_GCM 6910 6911 status = LTC_AES_DecryptTagGcm(LTC_BASE, in, out, sz, iv, ivSz, 6912 authIn, authInSz, (byte*)aes->key, keySize, authTag, authTagSz); 6913 6914 ret = (status == kStatus_Success) ? 0 : AES_GCM_AUTH_E; 6915 6916 #elif defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7)) 6917 6918 /* additional argument checks - STM32 HW only supports 12 byte IV */ 6919 if (ivSz != NONCE_SZ) { 6920 return BAD_FUNC_ARG; 6921 } 6922 6923 XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); 6924 XMEMCPY(initialCounter, iv, ivSz); 6925 initialCounter[AES_BLOCK_SIZE - 1] = STM32_GCM_IV_START; 6926 6927 /* Need to pad the AAD and input cipher text to a full block size since 6928 * CRYP_AES_GCM will assume these are a multiple of AES_BLOCK_SIZE. 6929 * It is okay to pad with zeros because GCM does this before GHASH already. 
6930 * See NIST SP 800-38D */ 6931 6932 if ((sz % AES_BLOCK_SIZE) > 0) { 6933 inPadSz = ((sz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; 6934 inPadded = XMALLOC(inPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 6935 if (inPadded == NULL) { 6936 return MEMORY_E; 6937 } 6938 XMEMSET(inPadded, 0, inPadSz); 6939 XMEMCPY(inPadded, in, sz); 6940 } else { 6941 inPadSz = sz; 6942 inPadded = (byte*)in; 6943 } 6944 6945 if ((authInSz % AES_BLOCK_SIZE) > 0) { 6946 authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; 6947 authInPadded = XMALLOC(authPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 6948 if (authInPadded == NULL) { 6949 if (inPadded != NULL && inPadSz != sz) 6950 XFREE(inPadded , aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 6951 return MEMORY_E; 6952 } 6953 XMEMSET(authInPadded, 0, authPadSz); 6954 XMEMCPY(authInPadded, authIn, authInSz); 6955 } else { 6956 authPadSz = authInSz; 6957 authInPadded = (byte*)authIn; 6958 } 6959 6960 #ifdef WOLFSSL_STM32_CUBEMX 6961 XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); 6962 switch(keySize) { 6963 case 16: /* 128-bit key */ 6964 hcryp.Init.KeySize = CRYP_KEYSIZE_128B; 6965 break; 6966 case 24: /* 192-bit key */ 6967 hcryp.Init.KeySize = CRYP_KEYSIZE_192B; 6968 break; 6969 case 32: /* 256-bit key */ 6970 hcryp.Init.KeySize = CRYP_KEYSIZE_256B; 6971 break; 6972 default: 6973 break; 6974 } 6975 hcryp.Instance = CRYP; 6976 hcryp.Init.DataType = CRYP_DATATYPE_8B; 6977 hcryp.Init.pKey = (byte*)aes->key; 6978 hcryp.Init.pInitVect = initialCounter; 6979 hcryp.Init.Header = authInPadded; 6980 hcryp.Init.HeaderSize = authInSz; 6981 6982 HAL_CRYP_Init(&hcryp); 6983 /* Use inPadded for output buffer instead of 6984 * out so that we don't overflow our size. 
*/ 6985 status = HAL_CRYPEx_AESGCM_Decrypt(&hcryp, (byte*)inPadded, 6986 sz, inPadded, STM32_HAL_TIMEOUT); 6987 /* Compute the authTag */ 6988 if (status == HAL_OK) 6989 status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, tag, STM32_HAL_TIMEOUT); 6990 6991 if (status != HAL_OK) 6992 ret = AES_GCM_AUTH_E; 6993 6994 HAL_CRYP_DeInit(&hcryp); 6995 #else 6996 ByteReverseWords((word32*)keyCopy, (word32*)aes->key, keySize); 6997 6998 /* Input size and auth size need to be the actual sizes, even though 6999 * they are not block aligned, because this length (in bits) is used 7000 * in the final GHASH. Use inPadded for output buffer instead of 7001 * out so that we don't overflow our size. */ 7002 status = CRYP_AES_GCM(MODE_DECRYPT, (uint8_t*)initialCounter, 7003 (uint8_t*)keyCopy, keySize * 8, 7004 (uint8_t*)inPadded, sz, 7005 (uint8_t*)authInPadded,authInSz, 7006 (uint8_t*)inPadded, tag); 7007 if (status != SUCCESS) 7008 ret = AES_GCM_AUTH_E; 7009 #endif /* WOLFSSL_STM32_CUBEMX */ 7010 7011 if (ret == 0 && ConstantCompare(authTag, tag, authTagSz) == 0) { 7012 /* Only keep the decrypted data if authTag success. 
*/ 7013 XMEMCPY(out, inPadded, sz); 7014 ret = 0; /* success */ 7015 } 7016 7017 /* only allocate padding buffers if the inputs are not a multiple of block sz */ 7018 if (inPadded != NULL && inPadSz != sz) 7019 XFREE(inPadded , aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 7020 if (authInPadded != NULL && authPadSz != authInSz) 7021 XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); 7022 7023 #else 7024 7025 /* software AES GCM */ 8838 #ifdef WOLF_CRYPTO_DEV 8839 if (aes->devId != INVALID_DEVID) { 8840 int ret = wc_CryptoDev_AesGcmDecrypt(aes, out, in, sz, iv, ivSz, 8841 authTag, authTagSz, authIn, authInSz); 8842 if (ret != NOT_COMPILED_IN) 8843 return ret; 8844 } 8845 #endif 8846 8847 #if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \ 8848 defined(WOLFSSL_STM32F7) || \ 8849 defined(WOLFSSL_STM32L4)) 8850 8851 /* STM32 HW only supports 12 byte IV and 16 byte auth */ 8852 if (ivSz == GCM_NONCE_MID_SZ && authInSz == AES_BLOCK_SIZE) { 8853 return wc_AesGcmDecrypt_STM32(aes, out, in, sz, iv, ivSz, 8854 authTag, authTagSz, authIn, authInSz); 8855 } 8856 #endif 7026 8857 7027 8858 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) 7028 8859 /* if async and byte count above threshold */ 8860 /* only 12-byte IV is supported in HW */ 7029 8861 if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES && 7030 sz >= WC_ASYNC_THRESH_AES_GCM) {8862 sz >= WC_ASYNC_THRESH_AES_GCM && ivSz == GCM_NONCE_MID_SZ) { 7031 8863 #if defined(HAVE_CAVIUM) 7032 /* Not yet supported, contact wolfSSL if interested in using */ 8864 #ifdef HAVE_CAVIUM_V 8865 if (authInSz == 20) { /* Nitrox V GCM is only working with 20 byte AAD */ 8866 return NitroxAesGcmDecrypt(aes, out, in, sz, 8867 (const byte*)aes->asyncKey, aes->keylen, iv, ivSz, 8868 authTag, authTagSz, authIn, authInSz); 8869 } 8870 #endif 7033 8871 #elif defined(HAVE_INTEL_QA) 7034 8872 return IntelQaSymAesGcmDecrypt(&aes->asyncDev, out, in, sz, … … 7055 8893 7056 8894 #ifdef WOLFSSL_AESNI 7057 if (haveAESNI) {7058 8895 
#ifdef HAVE_INTEL_AVX2 7059 8896 if (IS_INTEL_AVX2(intel_flags)) { 7060 if (AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, 7061 ivSz, (byte*)aes->key, aes->rounds) == 0) 8897 AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, 8898 authTagSz, (byte*)aes->key, aes->rounds, &res); 8899 if (res == 0) 7062 8900 return AES_GCM_AUTH_E; 8901 return 0; 8902 } 8903 else 8904 #endif 8905 #ifdef HAVE_INTEL_AVX1 8906 if (IS_INTEL_AVX1(intel_flags)) { 8907 AES_GCM_decrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz, 8908 authTagSz, (byte*)aes->key, aes->rounds, &res); 8909 if (res == 0) 8910 return AES_GCM_AUTH_E; 8911 return 0; 7063 8912 } 7064 8913 else 7065 8914 #endif 7066 if (AES_GCM_decrypt(in, out, authIn, iv, authTag, sz, authInSz, ivSz, 7067 (byte*)aes->key, aes->rounds) == 0) 8915 if (haveAESNI) { 8916 AES_GCM_decrypt(in, out, authIn, iv, authTag, sz, authInSz, ivSz, 8917 authTagSz, (byte*)aes->key, aes->rounds, &res); 8918 if (res == 0) 7068 8919 return AES_GCM_AUTH_E; 7069 8920 return 0; 7070 8921 } 8922 else 7071 8923 #endif 7072 7073 ctr = counter; 7074 XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); 7075 if (ivSz == NONCE_SZ) { 7076 XMEMCPY(initialCounter, iv, ivSz); 7077 initialCounter[AES_BLOCK_SIZE - 1] = 1; 7078 } 7079 else { 7080 GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); 7081 } 7082 XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); 7083 7084 /* Calc the authTag again using the received auth data and the cipher text */ 7085 GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime)); 7086 wc_AesEncrypt(aes, ctr, EKY0); 7087 xorbuf(Tprime, EKY0, sizeof(Tprime)); 7088 7089 if (ConstantCompare(authTag, Tprime, authTagSz) != 0) { 7090 return AES_GCM_AUTH_E; 7091 } 7092 7093 #ifdef WOLFSSL_PIC32MZ_CRYPT 7094 if (blocks) { 7095 /* use intitial IV for PIC32 HW, but don't use it below */ 7096 XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE); 7097 7098 ret = wc_Pic32AesCrypt( 7099 aes->key, aes->keylen, aes->reg, 
AES_BLOCK_SIZE, 7100 out, in, (blocks * AES_BLOCK_SIZE), 7101 PIC32_DECRYPTION, PIC32_ALGO_AES, PIC32_CRYPTOALGO_AES_GCM); 7102 if (ret != 0) 7103 return ret; 7104 } 7105 /* process remainder using partial handling */ 8924 { 8925 return AES_GCM_decrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, 8926 authIn, authInSz); 8927 } 8928 } 7106 8929 #endif 7107 7108 while (blocks--) { 7109 IncrementGcmCounter(ctr); 7110 #ifndef WOLFSSL_PIC32MZ_CRYPT 7111 wc_AesEncrypt(aes, ctr, scratch); 7112 xorbuf(scratch, c, AES_BLOCK_SIZE); 7113 XMEMCPY(p, scratch, AES_BLOCK_SIZE); 7114 #endif 7115 p += AES_BLOCK_SIZE; 7116 c += AES_BLOCK_SIZE; 7117 } 7118 if (partial != 0) { 7119 IncrementGcmCounter(ctr); 7120 wc_AesEncrypt(aes, ctr, scratch); 7121 xorbuf(scratch, c, partial); 7122 XMEMCPY(p, scratch, partial); 7123 } 7124 7125 #endif 8930 #endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */ 8931 #endif /* WOLFSSL_XILINX_CRYPT */ 8932 #endif /* end of block for AESGCM implementation selection */ 8933 8934 8935 /* Common to all, abstract functions that build off of lower level AESGCM 8936 * functions */ 8937 #ifndef WC_NO_RNG 8938 8939 int wc_AesGcmSetExtIV(Aes* aes, const byte* iv, word32 ivSz) 8940 { 8941 int ret = 0; 8942 8943 if (aes == NULL || iv == NULL || 8944 (ivSz != GCM_NONCE_MIN_SZ && ivSz != GCM_NONCE_MID_SZ && 8945 ivSz != GCM_NONCE_MAX_SZ)) { 8946 8947 ret = BAD_FUNC_ARG; 8948 } 8949 8950 if (ret == 0) { 8951 XMEMCPY((byte*)aes->reg, iv, ivSz); 8952 8953 /* If the IV is 96, allow for a 2^64 invocation counter. 8954 * For any other size for the nonce, limit the invocation 8955 * counter to 32-bits. (SP 800-38D 8.3) */ 8956 aes->invokeCtr[0] = 0; 8957 aes->invokeCtr[1] = (ivSz == GCM_NONCE_MID_SZ) ? 
0 : 0xFFFFFFFF; 8958 aes->nonceSz = ivSz; 8959 } 7126 8960 7127 8961 return ret; 7128 8962 } 7129 8963 7130 #endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */ 7131 #endif /* (WOLFSSL_XILINX_CRYPT) */ 8964 8965 int wc_AesGcmSetIV(Aes* aes, word32 ivSz, 8966 const byte* ivFixed, word32 ivFixedSz, 8967 WC_RNG* rng) 8968 { 8969 int ret = 0; 8970 8971 if (aes == NULL || rng == NULL || 8972 (ivSz != GCM_NONCE_MIN_SZ && ivSz != GCM_NONCE_MID_SZ && 8973 ivSz != GCM_NONCE_MAX_SZ) || 8974 (ivFixed == NULL && ivFixedSz != 0) || 8975 (ivFixed != NULL && ivFixedSz != AES_IV_FIXED_SZ)) { 8976 8977 ret = BAD_FUNC_ARG; 8978 } 8979 8980 if (ret == 0) { 8981 byte* iv = (byte*)aes->reg; 8982 8983 if (ivFixedSz) 8984 XMEMCPY(iv, ivFixed, ivFixedSz); 8985 8986 ret = wc_RNG_GenerateBlock(rng, iv + ivFixedSz, ivSz - ivFixedSz); 8987 } 8988 8989 if (ret == 0) { 8990 /* If the IV is 96, allow for a 2^64 invocation counter. 8991 * For any other size for the nonce, limit the invocation 8992 * counter to 32-bits. (SP 800-38D 8.3) */ 8993 aes->invokeCtr[0] = 0; 8994 aes->invokeCtr[1] = (ivSz == GCM_NONCE_MID_SZ) ? 
0 : 0xFFFFFFFF; 8995 aes->nonceSz = ivSz; 8996 } 8997 8998 return ret; 8999 } 9000 9001 9002 int wc_AesGcmEncrypt_ex(Aes* aes, byte* out, const byte* in, word32 sz, 9003 byte* ivOut, word32 ivOutSz, 9004 byte* authTag, word32 authTagSz, 9005 const byte* authIn, word32 authInSz) 9006 { 9007 int ret = 0; 9008 9009 if (aes == NULL || (sz != 0 && (in == NULL || out == NULL)) || 9010 ivOut == NULL || ivOutSz != aes->nonceSz || 9011 (authIn == NULL && authInSz != 0)) { 9012 9013 ret = BAD_FUNC_ARG; 9014 } 9015 9016 if (ret == 0) { 9017 aes->invokeCtr[0]++; 9018 if (aes->invokeCtr[0] == 0) { 9019 aes->invokeCtr[1]++; 9020 if (aes->invokeCtr[1] == 0) 9021 ret = AES_GCM_OVERFLOW_E; 9022 } 9023 } 9024 9025 if (ret == 0) { 9026 XMEMCPY(ivOut, aes->reg, ivOutSz); 9027 ret = wc_AesGcmEncrypt(aes, out, in, sz, 9028 (byte*)aes->reg, ivOutSz, 9029 authTag, authTagSz, 9030 authIn, authInSz); 9031 IncCtr((byte*)aes->reg, ivOutSz); 9032 } 9033 9034 return ret; 9035 } 9036 9037 int wc_Gmac(const byte* key, word32 keySz, byte* iv, word32 ivSz, 9038 const byte* authIn, word32 authInSz, 9039 byte* authTag, word32 authTagSz, WC_RNG* rng) 9040 { 9041 Aes aes; 9042 int ret; 9043 9044 if (key == NULL || iv == NULL || (authIn == NULL && authInSz != 0) || 9045 authTag == NULL || authTagSz == 0 || rng == NULL) { 9046 9047 return BAD_FUNC_ARG; 9048 } 9049 9050 ret = wc_AesInit(&aes, NULL, INVALID_DEVID); 9051 if (ret == 0) { 9052 ret = wc_AesGcmSetKey(&aes, key, keySz); 9053 if (ret == 0) 9054 ret = wc_AesGcmSetIV(&aes, ivSz, NULL, 0, rng); 9055 if (ret == 0) 9056 ret = wc_AesGcmEncrypt_ex(&aes, NULL, NULL, 0, iv, ivSz, 9057 authTag, authTagSz, authIn, authInSz); 9058 wc_AesFree(&aes); 9059 } 9060 ForceZero(&aes, sizeof(aes)); 9061 9062 return ret; 9063 } 9064 9065 int wc_GmacVerify(const byte* key, word32 keySz, 9066 const byte* iv, word32 ivSz, 9067 const byte* authIn, word32 authInSz, 9068 const byte* authTag, word32 authTagSz) 9069 { 9070 int ret; 9071 #ifndef NO_AES_DECRYPT 9072 Aes aes; 
9073 9074 if (key == NULL || iv == NULL || (authIn == NULL && authInSz != 0) || 9075 authTag == NULL || authTagSz == 0 || authTagSz > AES_BLOCK_SIZE) { 9076 9077 return BAD_FUNC_ARG; 9078 } 9079 9080 ret = wc_AesInit(&aes, NULL, INVALID_DEVID); 9081 if (ret == 0) { 9082 ret = wc_AesGcmSetKey(&aes, key, keySz); 9083 if (ret == 0) 9084 ret = wc_AesGcmDecrypt(&aes, NULL, NULL, 0, iv, ivSz, 9085 authTag, authTagSz, authIn, authInSz); 9086 wc_AesFree(&aes); 9087 } 9088 ForceZero(&aes, sizeof(aes)); 9089 #else 9090 (void)key; 9091 (void)keySz; 9092 (void)iv; 9093 (void)ivSz; 9094 (void)authIn; 9095 (void)authInSz; 9096 (void)authTag; 9097 (void)authTagSz; 9098 ret = NOT_COMPILED_IN; 9099 #endif 9100 return ret; 9101 } 9102 9103 #endif /* WC_NO_RNG */ 9104 7132 9105 7133 9106 WOLFSSL_API int wc_GmacSetKey(Gmac* gmac, const byte* key, word32 len) … … 7155 9128 int wc_AesCcmSetKey(Aes* aes, const byte* key, word32 keySz) 7156 9129 { 9130 if (!((keySz == 16) || (keySz == 24) || (keySz == 32))) 9131 return BAD_FUNC_ARG; 9132 7157 9133 return wc_AesSetKey(aes, key, keySz, NULL, AES_ENCRYPTION); 7158 9134 } 7159 9135 7160 #if defined(HAVE_COLDFIRE_SEC) 9136 #ifdef WOLFSSL_ARMASM 9137 /* implementation located in wolfcrypt/src/port/arm/armv8-aes.c */ 9138 9139 #elif defined(HAVE_COLDFIRE_SEC) 7161 9140 #error "Coldfire SEC doesn't currently support AES-CCM mode" 9141 9142 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) 9143 /* implemented in wolfcrypt/src/port/caam_aes.c */ 7162 9144 7163 9145 #elif defined(FREESCALE_LTC) … … 7226 9208 #endif /* HAVE_AES_DECRYPT */ 7227 9209 7228 7229 /* software AES CCM */7230 9210 #else 7231 9211 9212 /* Software CCM */ 7232 9213 static void roll_x(Aes* aes, const byte* in, word32 inSz, byte* out) 7233 9214 { … … 7293 9274 7294 9275 7295 static INLINE void AesCcmCtrInc(byte* B, word32 lenSz)9276 static WC_INLINE void AesCcmCtrInc(byte* B, word32 lenSz) 7296 9277 { 7297 9278 word32 i; … … 7302 9283 } 7303 9284 9285 /* Software 
AES - CCM Encrypt */ 7304 9286 /* return 0 on success */ 7305 9287 int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, … … 7317 9299 /* sanity check on arguments */ 7318 9300 if (aes == NULL || out == NULL || in == NULL || nonce == NULL 7319 || authTag == NULL || nonceSz < 7 || nonceSz > 13) 9301 || authTag == NULL || nonceSz < 7 || nonceSz > 13 || 9302 authTagSz > AES_BLOCK_SIZE) 7320 9303 return BAD_FUNC_ARG; 7321 9304 … … 7369 9352 7370 9353 #ifdef HAVE_AES_DECRYPT 9354 /* Software AES - CCM Decrypt */ 7371 9355 int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, 7372 9356 const byte* nonce, word32 nonceSz, … … 7385 9369 /* sanity check on arguments */ 7386 9370 if (aes == NULL || out == NULL || in == NULL || nonce == NULL 7387 || authTag == NULL || nonceSz < 7 || nonceSz > 13) 9371 || authTag == NULL || nonceSz < 7 || nonceSz > 13 || 9372 authTagSz > AES_BLOCK_SIZE) 7388 9373 return BAD_FUNC_ARG; 7389 9374 … … 7457 9442 return result; 7458 9443 } 9444 7459 9445 #endif /* HAVE_AES_DECRYPT */ 7460 #endif /* software AES CCM */ 9446 #endif /* software CCM */ 9447 9448 /* abstract functions that call lower level AESCCM functions */ 9449 #ifndef WC_NO_RNG 9450 9451 int wc_AesCcmSetNonce(Aes* aes, const byte* nonce, word32 nonceSz) 9452 { 9453 int ret = 0; 9454 9455 if (aes == NULL || nonce == NULL || 9456 nonceSz < CCM_NONCE_MIN_SZ || nonceSz > CCM_NONCE_MAX_SZ) { 9457 9458 ret = BAD_FUNC_ARG; 9459 } 9460 9461 if (ret == 0) { 9462 XMEMCPY(aes->reg, nonce, nonceSz); 9463 aes->nonceSz = nonceSz; 9464 9465 /* Invocation counter should be 2^61 */ 9466 aes->invokeCtr[0] = 0; 9467 aes->invokeCtr[1] = 0xE0000000; 9468 } 9469 9470 return ret; 9471 } 9472 9473 9474 int wc_AesCcmEncrypt_ex(Aes* aes, byte* out, const byte* in, word32 sz, 9475 byte* ivOut, word32 ivOutSz, 9476 byte* authTag, word32 authTagSz, 9477 const byte* authIn, word32 authInSz) 9478 { 9479 int ret = 0; 9480 9481 if (aes == NULL || out == NULL || 9482 (in == NULL && sz 
!= 0) || 9483 ivOut == NULL || 9484 (authIn == NULL && authInSz != 0) || 9485 (ivOutSz != aes->nonceSz)) { 9486 9487 ret = BAD_FUNC_ARG; 9488 } 9489 9490 if (ret == 0) { 9491 aes->invokeCtr[0]++; 9492 if (aes->invokeCtr[0] == 0) { 9493 aes->invokeCtr[1]++; 9494 if (aes->invokeCtr[1] == 0) 9495 ret = AES_CCM_OVERFLOW_E; 9496 } 9497 } 9498 9499 if (ret == 0) { 9500 ret = wc_AesCcmEncrypt(aes, out, in, sz, 9501 (byte*)aes->reg, aes->nonceSz, 9502 authTag, authTagSz, 9503 authIn, authInSz); 9504 XMEMCPY(ivOut, aes->reg, aes->nonceSz); 9505 IncCtr((byte*)aes->reg, aes->nonceSz); 9506 } 9507 9508 return ret; 9509 } 9510 9511 #endif /* WC_NO_RNG */ 7461 9512 7462 9513 #endif /* HAVE_AESCCM */ … … 7473 9524 aes->heap = heap; 7474 9525 9526 #ifdef WOLF_CRYPTO_DEV 9527 aes->devId = devId; 9528 #else 9529 (void)devId; 9530 #endif 7475 9531 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) 7476 9532 ret = wolfAsync_DevCtxInit(&aes->asyncDev, WOLFSSL_ASYNC_MARKER_AES, 7477 9533 aes->heap, devId); 7478 #else7479 (void)devId;7480 9534 #endif /* WOLFSSL_ASYNC_CRYPT */ 9535 9536 #ifdef WOLFSSL_AFALG 9537 aes->alFd = -1; 9538 aes->rdFd = -1; 9539 #endif 9540 #if defined(WOLFSSL_DEVCRYPTO) && \ 9541 (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC)) 9542 aes->ctx.cfd = -1; 9543 #endif 7481 9544 7482 9545 return ret; 7483 9546 } 9547 9548 #ifdef HAVE_PKCS11 9549 int wc_AesInit_Id(Aes* aes, unsigned char* id, int len, void* heap, int devId) 9550 { 9551 int ret = 0; 9552 9553 if (aes == NULL) 9554 ret = BAD_FUNC_ARG; 9555 if (ret == 0 && (len < 0 || len > AES_MAX_ID_LEN)) 9556 ret = BUFFER_E; 9557 9558 if (ret == 0) 9559 ret = wc_AesInit(aes, heap, devId); 9560 if (ret == 0) { 9561 XMEMCPY(aes->id, id, len); 9562 aes->idLen = len; 9563 } 9564 9565 return ret; 9566 } 9567 #endif 7484 9568 7485 9569 /* Free Aes from use with async hardware */ … … 7492 9576 wolfAsync_DevCtxFree(&aes->asyncDev, WOLFSSL_ASYNC_MARKER_AES); 7493 9577 #endif /* WOLFSSL_ASYNC_CRYPT 
*/ 9578 #ifdef WOLFSSL_AFALG 9579 if (aes->rdFd > 0) { /* negative is error case */ 9580 close(aes->rdFd); 9581 } 9582 if (aes->alFd > 0) { 9583 close(aes->alFd); 9584 } 9585 #endif /* WOLFSSL_AFALG */ 9586 #if defined(WOLFSSL_DEVCRYPTO) && \ 9587 (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC)) 9588 wc_DevCryptoFree(&aes->ctx); 9589 ForceZero((byte*)aes->devKey, AES_MAX_KEY_SIZE/WOLFSSL_BIT_SIZE); 9590 #endif 7494 9591 } 7495 9592 … … 7504 9601 7505 9602 switch (aes->rounds) { 9603 #ifdef WOLFSSL_AES_128 7506 9604 case 10: 7507 9605 *keySize = 16; 7508 9606 break; 9607 #endif 9608 #ifdef WOLFSSL_AES_192 7509 9609 case 12: 7510 9610 *keySize = 24; 7511 9611 break; 9612 #endif 9613 #ifdef WOLFSSL_AES_256 7512 9614 case 14: 7513 9615 *keySize = 32; 7514 9616 break; 9617 #endif 7515 9618 default: 7516 9619 *keySize = 0; … … 7521 9624 } 7522 9625 7523 #endif /* !WOLFSSL_ARMASM */7524 9626 #endif /* !WOLFSSL_TI_CRYPT */ 7525 9627 9628 #ifdef HAVE_AES_ECB 9629 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) 9630 /* implemented in wolfcrypt/src/port/caam/caam_aes.c */ 9631 9632 #elif defined(WOLFSSL_AFALG) 9633 /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */ 9634 9635 #elif defined(WOLFSSL_DEVCRYPTO_AES) 9636 /* implemented in wolfcrypt/src/port/devcrypt/devcrypto_aes.c */ 9637 9638 #else 9639 9640 /* Software AES - ECB */ 9641 int wc_AesEcbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) 9642 { 9643 word32 blocks = sz / AES_BLOCK_SIZE; 9644 9645 if ((in == NULL) || (out == NULL) || (aes == NULL)) 9646 return BAD_FUNC_ARG; 9647 while (blocks>0) { 9648 wc_AesEncryptDirect(aes, out, in); 9649 out += AES_BLOCK_SIZE; 9650 in += AES_BLOCK_SIZE; 9651 sz -= AES_BLOCK_SIZE; 9652 blocks--; 9653 } 9654 return 0; 9655 } 9656 9657 9658 int wc_AesEcbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) 9659 { 9660 word32 blocks = sz / AES_BLOCK_SIZE; 9661 9662 if ((in == NULL) || (out == NULL) || (aes == NULL)) 9663 return 
BAD_FUNC_ARG; 9664 while (blocks>0) { 9665 wc_AesDecryptDirect(aes, out, in); 9666 out += AES_BLOCK_SIZE; 9667 in += AES_BLOCK_SIZE; 9668 sz -= AES_BLOCK_SIZE; 9669 blocks--; 9670 } 9671 return 0; 9672 } 9673 #endif 9674 #endif /* HAVE_AES_ECB */ 9675 9676 #ifdef WOLFSSL_AES_CFB 9677 /* CFB 128 9678 * 9679 * aes structure holding key to use for encryption 9680 * out buffer to hold result of encryption (must be at least as large as input 9681 * buffer) 9682 * in buffer to encrypt 9683 * sz size of input buffer 9684 * 9685 * returns 0 on success and negative error values on failure 9686 */ 9687 /* Software AES - CFB Encrypt */ 9688 int wc_AesCfbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) 9689 { 9690 byte* tmp = NULL; 9691 byte* reg = NULL; 9692 9693 WOLFSSL_ENTER("wc_AesCfbEncrypt"); 9694 9695 if (aes == NULL || out == NULL || in == NULL) { 9696 return BAD_FUNC_ARG; 9697 } 9698 9699 if (aes->left && sz) { 9700 reg = (byte*)aes->reg + AES_BLOCK_SIZE - aes->left; 9701 } 9702 9703 /* consume any unused bytes left in aes->tmp */ 9704 tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left; 9705 while (aes->left && sz) { 9706 *(out++) = *(reg++) = *(in++) ^ *(tmp++); 9707 aes->left--; 9708 sz--; 9709 } 9710 9711 while (sz >= AES_BLOCK_SIZE) { 9712 wc_AesEncryptDirect(aes, out, (byte*)aes->reg); 9713 xorbuf(out, in, AES_BLOCK_SIZE); 9714 XMEMCPY(aes->reg, out, AES_BLOCK_SIZE); 9715 out += AES_BLOCK_SIZE; 9716 in += AES_BLOCK_SIZE; 9717 sz -= AES_BLOCK_SIZE; 9718 aes->left = 0; 9719 } 9720 9721 /* encrypt left over data */ 9722 if (sz) { 9723 wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); 9724 aes->left = AES_BLOCK_SIZE; 9725 tmp = (byte*)aes->tmp; 9726 reg = (byte*)aes->reg; 9727 9728 while (sz--) { 9729 *(out++) = *(reg++) = *(in++) ^ *(tmp++); 9730 aes->left--; 9731 } 9732 } 9733 9734 return 0; 9735 } 9736 9737 9738 #ifdef HAVE_AES_DECRYPT 9739 /* CFB 128 9740 * 9741 * aes structure holding key to use for decryption 9742 * out buffer to hold result of 
decryption (must be at least as large as input 9743 * buffer) 9744 * in buffer to decrypt 9745 * sz size of input buffer 9746 * 9747 * returns 0 on success and negative error values on failure 9748 */ 9749 /* Software AES - CFB Decrypt */ 9750 int wc_AesCfbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) 9751 { 9752 byte* tmp; 9753 9754 WOLFSSL_ENTER("wc_AesCfbDecrypt"); 9755 9756 if (aes == NULL || out == NULL || in == NULL) { 9757 return BAD_FUNC_ARG; 9758 } 9759 9760 /* check if more input needs copied over to aes->reg */ 9761 if (aes->left && sz) { 9762 int size = min(aes->left, sz); 9763 XMEMCPY((byte*)aes->reg + AES_BLOCK_SIZE - aes->left, in, size); 9764 } 9765 9766 /* consume any unused bytes left in aes->tmp */ 9767 tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left; 9768 while (aes->left && sz) { 9769 *(out++) = *(in++) ^ *(tmp++); 9770 aes->left--; 9771 sz--; 9772 } 9773 9774 while (sz > AES_BLOCK_SIZE) { 9775 wc_AesEncryptDirect(aes, out, (byte*)aes->reg); 9776 xorbuf(out, in, AES_BLOCK_SIZE); 9777 XMEMCPY(aes->reg, in, AES_BLOCK_SIZE); 9778 out += AES_BLOCK_SIZE; 9779 in += AES_BLOCK_SIZE; 9780 sz -= AES_BLOCK_SIZE; 9781 aes->left = 0; 9782 } 9783 9784 /* decrypt left over data */ 9785 if (sz) { 9786 wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); 9787 XMEMCPY(aes->reg, in, sz); 9788 aes->left = AES_BLOCK_SIZE; 9789 tmp = (byte*)aes->tmp; 9790 9791 while (sz--) { 9792 *(out++) = *(in++) ^ *(tmp++); 9793 aes->left--; 9794 } 9795 } 9796 9797 return 0; 9798 } 9799 #endif /* HAVE_AES_DECRYPT */ 9800 #endif /* WOLFSSL_AES_CFB */ 9801 7526 9802 7527 9803 #ifdef HAVE_AES_KEYWRAP 7528 9804 7529 9805 /* Initialize key wrap counter with value */ 7530 static INLINE void InitKeyWrapCounter(byte* inOutCtr, word32 value)9806 static WC_INLINE void InitKeyWrapCounter(byte* inOutCtr, word32 value) 7531 9807 { 7532 9808 int i; … … 7541 9817 7542 9818 /* Increment key wrap counter */ 7543 static INLINE void IncrementKeyWrapCounter(byte* inOutCtr)9819 
static WC_INLINE void IncrementKeyWrapCounter(byte* inOutCtr) 7544 9820 { 7545 9821 int i; … … 7553 9829 7554 9830 /* Decrement key wrap counter */ 7555 static INLINE void DecrementKeyWrapCounter(byte* inOutCtr)9831 static WC_INLINE void DecrementKeyWrapCounter(byte* inOutCtr) 7556 9832 { 7557 9833 int i; … … 7836 10112 } 7837 10113 10114 #ifdef HAVE_AES_ECB 10115 /* helper function for encrypting / decrypting full buffer at once */ 10116 static int _AesXtsHelper(Aes* aes, byte* out, const byte* in, word32 sz, int dir) 10117 { 10118 word32 outSz = sz; 10119 word32 totalSz = (sz / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; /* total bytes */ 10120 byte* pt = out; 10121 10122 outSz -= AES_BLOCK_SIZE; 10123 10124 while (outSz > 0) { 10125 word32 j; 10126 byte carry = 0; 10127 10128 /* multiply by shift left and propagate carry */ 10129 for (j = 0; j < AES_BLOCK_SIZE && outSz > 0; j++, outSz--) { 10130 byte tmpC; 10131 10132 tmpC = (pt[j] >> 7) & 0x01; 10133 pt[j+AES_BLOCK_SIZE] = ((pt[j] << 1) + carry) & 0xFF; 10134 carry = tmpC; 10135 } 10136 if (carry) { 10137 pt[AES_BLOCK_SIZE] ^= GF_XTS; 10138 } 10139 10140 pt += AES_BLOCK_SIZE; 10141 } 10142 10143 xorbuf(out, in, totalSz); 10144 if (dir == AES_ENCRYPTION) { 10145 return wc_AesEcbEncrypt(aes, out, out, totalSz); 10146 } 10147 else { 10148 return wc_AesEcbDecrypt(aes, out, out, totalSz); 10149 } 10150 } 10151 #endif /* HAVE_AES_ECB */ 10152 7838 10153 7839 10154 /* AES with XTS mode. (XTS) XEX encryption with Tweak and cipher text Stealing. 
… … 7849 10164 * returns 0 on success 7850 10165 */ 10166 /* Software AES - XTS Encrypt */ 7851 10167 int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, 7852 10168 const byte* i, word32 iSz) … … 7856 10172 Aes *aes, *tweak; 7857 10173 7858 if (xaes == NULL || out == NULL ) {10174 if (xaes == NULL || out == NULL || in == NULL) { 7859 10175 return BAD_FUNC_ARG; 7860 10176 } … … 7867 10183 } 7868 10184 7869 if (in == NULL && sz > 0) {7870 return BAD_FUNC_ARG;7871 }7872 7873 10185 if (blocks > 0) { 7874 10186 byte tmp[AES_BLOCK_SIZE]; 7875 10187 10188 XMEMSET(tmp, 0, AES_BLOCK_SIZE); /* set to 0's in case of improper AES 10189 * key setup passed to encrypt direct*/ 10190 7876 10191 wc_AesEncryptDirect(tweak, tmp, i); 10192 10193 #ifdef HAVE_AES_ECB 10194 /* encrypt all of buffer at once when possible */ 10195 if (in != out) { /* can not handle inline */ 10196 XMEMCPY(out, tmp, AES_BLOCK_SIZE); 10197 if ((ret = _AesXtsHelper(aes, out, in, sz, AES_ENCRYPTION)) != 0) { 10198 return ret; 10199 } 10200 } 10201 #endif 7877 10202 7878 10203 while (blocks > 0) { … … 7881 10206 byte buf[AES_BLOCK_SIZE]; 7882 10207 10208 #ifdef HAVE_AES_ECB 10209 if (in == out) { /* check for if inline */ 10210 #endif 7883 10211 XMEMCPY(buf, in, AES_BLOCK_SIZE); 7884 10212 xorbuf(buf, tmp, AES_BLOCK_SIZE); 7885 10213 wc_AesEncryptDirect(aes, out, buf); 10214 #ifdef HAVE_AES_ECB 10215 } 10216 #endif 7886 10217 xorbuf(out, tmp, AES_BLOCK_SIZE); 7887 10218 7888 /* multiply by shift left and prop ogate carry */10219 /* multiply by shift left and propagate carry */ 7889 10220 for (j = 0; j < AES_BLOCK_SIZE; j++) { 7890 10221 byte tmpC; … … 7897 10228 tmp[0] ^= GF_XTS; 7898 10229 } 7899 carry = 0;7900 10230 7901 10231 in += AES_BLOCK_SIZE; … … 7942 10272 * returns 0 on success 7943 10273 */ 10274 /* Software AES - XTS Decrypt */ 7944 10275 int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, 7945 10276 const byte* i, word32 iSz) … … 7949 10280 Aes *aes, 
*tweak; 7950 10281 7951 if (xaes == NULL || out == NULL ) {10282 if (xaes == NULL || out == NULL || in == NULL) { 7952 10283 return BAD_FUNC_ARG; 7953 10284 } … … 7957 10288 7958 10289 if (iSz < AES_BLOCK_SIZE) { 7959 return BAD_FUNC_ARG;7960 }7961 7962 if (in == NULL && sz > 0) {7963 10290 return BAD_FUNC_ARG; 7964 10291 } … … 7970 10297 byte stl = (sz % AES_BLOCK_SIZE); 7971 10298 10299 XMEMSET(tmp, 0, AES_BLOCK_SIZE); /* set to 0's in case of improper AES 10300 * key setup passed to decrypt direct*/ 10301 7972 10302 wc_AesEncryptDirect(tweak, tmp, i); 7973 10303 … … 7978 10308 } 7979 10309 10310 #ifdef HAVE_AES_ECB 10311 /* decrypt all of buffer at once when possible */ 10312 if (in != out) { /* can not handle inline */ 10313 XMEMCPY(out, tmp, AES_BLOCK_SIZE); 10314 if ((ret = _AesXtsHelper(aes, out, in, sz, AES_DECRYPTION)) != 0) { 10315 return ret; 10316 } 10317 } 10318 #endif 10319 7980 10320 while (blocks > 0) { 7981 10321 byte buf[AES_BLOCK_SIZE]; 7982 10322 10323 #ifdef HAVE_AES_ECB 10324 if (in == out) { /* check for if inline */ 10325 #endif 7983 10326 XMEMCPY(buf, in, AES_BLOCK_SIZE); 7984 10327 xorbuf(buf, tmp, AES_BLOCK_SIZE); 7985 10328 wc_AesDecryptDirect(aes, out, buf); 10329 #ifdef HAVE_AES_ECB 10330 } 10331 #endif 7986 10332 xorbuf(out, tmp, AES_BLOCK_SIZE); 7987 10333 7988 /* multiply by shift left and prop ogate carry */10334 /* multiply by shift left and propagate carry */ 7989 10335 for (j = 0; j < AES_BLOCK_SIZE; j++) { 7990 10336 byte tmpC; … … 8010 10356 byte tmp2[AES_BLOCK_SIZE]; 8011 10357 8012 /* multiply by shift left and prop ogate carry */10358 /* multiply by shift left and propagate carry */ 8013 10359 for (j = 0; j < AES_BLOCK_SIZE; j++) { 8014 10360 byte tmpC;
Note:
See TracChangeset
for help on using the changeset viewer.