- Timestamp: Feb 7, 2019, 8:36:33 AM (5 years ago)
- File: 1 edited
asp3_tinet_ecnl_arm/trunk/wolfssl-3.12.2/wolfcrypt/src/sha256.c
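The central change in this diff is the rename of Transform() to Transform_Sha256() and the introduction of per-CPU function pointers (Transform_Sha256_p, Transform_Sha256_Len_p) that Sha256_SetTransform() fills in once, based on CPUID feature flags, before any block is hashed. The following is a minimal standalone sketch of that dispatch pattern only, not the wolfSSL code itself; the feature probes have_avx1()/have_avx2(), the Sha256Ctx type, and the transform_* stubs are hypothetical stand-ins for cpuid_get_flags()/IS_INTEL_AVX*, wc_Sha256, and the real C/AVX1/AVX2 transforms.

    #include <stdio.h>

    typedef struct Sha256Ctx Sha256Ctx;
    struct Sha256Ctx { unsigned int digest[8]; };

    /* Candidate block transforms (plain C stand-ins for the asm variants). */
    static int transform_c(Sha256Ctx *ctx)    { (void)ctx; puts("C transform");    return 0; }
    static int transform_avx1(Sha256Ctx *ctx) { (void)ctx; puts("AVX1 transform"); return 0; }
    static int transform_avx2(Sha256Ctx *ctx) { (void)ctx; puts("AVX2 transform"); return 0; }

    /* Hypothetical feature probes; the real code reads CPUID once and caches flags. */
    static int have_avx1(void) { return 1; }
    static int have_avx2(void) { return 0; }

    /* Function pointer chosen once, then used for every subsequent block. */
    static int (*transform_p)(Sha256Ctx *ctx) = transform_c;
    static int transform_check = 0;

    static void set_transform(void)
    {
        if (transform_check)
            return;                    /* select only once, like Sha256_SetTransform() */
        if (have_avx2())
            transform_p = transform_avx2;
        else if (have_avx1())
            transform_p = transform_avx1;
        else
            transform_p = transform_c; /* generic C fallback */
        transform_check = 1;
    }

    #define XTRANSFORM(S) (*transform_p)(S)

    int main(void)
    {
        Sha256Ctx ctx = {0};
        set_transform();
        return XTRANSFORM(&ctx);       /* dispatches to whichever variant was chosen */
    }

The same idea extends to Transform_Sha256_Len_p in the diff below: a second pointer handles whole multi-block runs when an AVX variant is available, and stays NULL for the generic C path.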
r352 r372 29 29 #include <wolfssl/wolfcrypt/settings.h> 30 30 31 #if !defined(NO_SHA256) 31 #if !defined(NO_SHA256) && !defined(WOLFSSL_ARMASM) 32 33 #if defined(HAVE_FIPS) && \ 34 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) 35 36 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ 37 #define FIPS_NO_WRAPPERS 38 39 #ifdef USE_WINDOWS_API 40 #pragma code_seg(".fipsA$d") 41 #pragma const_seg(".fipsB$d") 42 #endif 43 #endif 32 44 33 45 #include <wolfssl/wolfcrypt/sha256.h> … … 36 48 37 49 /* fips wrapper calls, user can call direct */ 38 #ifdef HAVE_FIPS 50 #if defined(HAVE_FIPS) && \ 51 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2)) 39 52 40 53 int wc_InitSha256(wc_Sha256* sha) … … 59 72 return BAD_FUNC_ARG; 60 73 } 74 75 if (data == NULL && len == 0) { 76 /* valid, but do nothing */ 77 return 0; 78 } 79 61 80 return Sha256Update_fips(sha, data, len); 62 81 } … … 74 93 } 75 94 76 #else /* else build without fips */95 #else /* else build without fips, or for FIPS v2 */ 77 96 78 97 … … 90 109 #endif 91 110 111 #ifdef WOLFSSL_DEVCRYPTO_HASH 112 #include <wolfssl/wolfcrypt/port/devcrypto/wc_devcrypto.h> 113 #endif 114 115 92 116 93 117 #if defined(USE_INTEL_SPEEDUP) 118 #if defined(__GNUC__) && ((__GNUC__ < 4) || \ 119 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) 120 #undef NO_AVX2_SUPPORT 121 #define NO_AVX2_SUPPORT 122 #endif 123 #if defined(__clang__) && ((__clang_major__ < 3) || \ 124 (__clang_major__ == 3 && __clang_minor__ <= 5)) 125 #define NO_AVX2_SUPPORT 126 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT) 127 #undef NO_AVX2_SUPPORT 128 #endif 129 94 130 #define HAVE_INTEL_AVX1 131 #ifndef NO_AVX2_SUPPORT 95 132 #define HAVE_INTEL_AVX2 133 #endif 96 134 #endif /* USE_INTEL_SPEEDUP */ 97 135 … … 101 139 102 140 103 static INLINE void AddLength(wc_Sha256* sha256, word32 len); 104 105 #if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH)141 #if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH_SHA2) && \ 142 (!defined(WOLFSSL_IMX6_CAAM) || defined(NO_IMX6_CAAM_HASH)) && \ 143 !defined(WOLFSSL_AFALG_HASH) && !defined(WOLFSSL_DEVCRYPTO_HASH) 106 144 static int InitSha256(wc_Sha256* sha256) 107 145 { … … 151 189 152 190 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) 153 Transform (); Function prototype191 Transform_Sha256(); Function prototype 154 192 #else 155 Transform () { }193 Transform_Sha256() { } 156 194 int Sha256Final() { 157 195 Save/Recover XMM, YMM … … 172 210 #define XMM Instructions/inline asm 173 211 174 int Transform () {212 int Transform_Sha256() { 175 213 Stitched Message Sched/Round 176 214 } … … 180 218 #define YMM Instructions/inline asm 181 219 182 int Transform () {220 int Transform_Sha256() { 183 221 More granural Stitched Message Sched/Round 184 222 } … … 193 231 194 232 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */ 195 static int Transform (wc_Sha256* sha256);233 static int Transform_Sha256(wc_Sha256* sha256); 196 234 #if defined(HAVE_INTEL_AVX1) 197 static int Transform_AVX1(wc_Sha256 *sha256); 235 static int Transform_Sha256_AVX1(wc_Sha256 *sha256); 236 static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, word32 len); 198 237 #endif 199 238 #if defined(HAVE_INTEL_AVX2) 200 static int Transform_AVX2(wc_Sha256 *sha256); 201 static int Transform_AVX1_RORX(wc_Sha256 *sha256); 202 #endif 203 static int (*Transform_p)(wc_Sha256* sha256) /* = _Transform */; 239 static int Transform_Sha256_AVX2(wc_Sha256 *sha256); 240 static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, word32 len); 241 #ifdef 
HAVE_INTEL_RORX 242 static int Transform_Sha256_AVX1_RORX(wc_Sha256 *sha256); 243 static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256, word32 len); 244 static int Transform_Sha256_AVX2_RORX(wc_Sha256 *sha256); 245 static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, word32 len); 246 #endif 247 #endif 248 static int (*Transform_Sha256_p)(wc_Sha256* sha256); 249 /* = _Transform_Sha256 */ 250 static int (*Transform_Sha256_Len_p)(wc_Sha256* sha256, word32 len); 251 /* = NULL */ 204 252 static int transform_check = 0; 205 253 static word32 intel_flags; 206 #define XTRANSFORM(S, B) (*Transform_p)((S)) 254 #define XTRANSFORM(S) (*Transform_Sha256_p)((S)) 255 #define XTRANSFORM_LEN(S, L) (*Transform_Sha256_Len_p)((S),(L)) 207 256 208 257 static void Sha256_SetTransform(void) … … 214 263 intel_flags = cpuid_get_flags(); 215 264 216 #if defined(HAVE_INTEL_AVX2) 217 if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) { 265 #ifdef HAVE_INTEL_AVX2 266 if (IS_INTEL_AVX2(intel_flags)) { 267 #ifdef HAVE_INTEL_RORX 268 if (IS_INTEL_BMI2(intel_flags)) { 269 Transform_Sha256_p = Transform_Sha256_AVX2_RORX; 270 Transform_Sha256_Len_p = Transform_Sha256_AVX2_RORX_Len; 271 } 272 else 273 #endif 218 274 if (1) 219 Transform_p = Transform_AVX1_RORX; 220 else 221 Transform_p = Transform_AVX2; 275 { 276 Transform_Sha256_p = Transform_Sha256_AVX2; 277 Transform_Sha256_Len_p = Transform_Sha256_AVX2_Len; 278 } 279 #ifdef HAVE_INTEL_RORX 280 else { 281 Transform_Sha256_p = Transform_Sha256_AVX1_RORX; 282 Transform_Sha256_Len_p = Transform_Sha256_AVX1_RORX_Len; 283 } 284 #endif 222 285 } 223 286 else 224 287 #endif 225 #if defined(HAVE_INTEL_AVX1)226 if ( 1) {227 Transform_ p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 :228 Transform);288 #ifdef HAVE_INTEL_AVX1 289 if (IS_INTEL_AVX1(intel_flags)) { 290 Transform_Sha256_p = Transform_Sha256_AVX1; 291 Transform_Sha256_Len_p = Transform_Sha256_AVX1_Len; 229 292 } 230 293 else 231 294 #endif 232 Transform_p = Transform; 295 { 296 Transform_Sha256_p = Transform_Sha256; 297 Transform_Sha256_Len_p = NULL; 298 } 233 299 234 300 transform_check = 1; 235 301 } 236 237 /* Dummy for saving MM_REGs on behalf of Transform */238 #if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1)239 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\240 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")241 #elif defined(HAVE_INTEL_AVX1)242 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\243 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\244 "xmm11","xmm12","xmm13","xmm14","xmm15")245 #endif246 302 247 303 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) … … 289 345 #endif 290 346 291 #define XTRANSFORM(S, B) Transform((S), (B)) 347 #define XTRANSFORM(S) Transform_Sha256((S)) 348 #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L)) 292 349 293 350 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) … … 316 373 } 317 374 318 static int Transform (wc_Sha256* sha256, byte* buf)375 static int Transform_Sha256(wc_Sha256* sha256) 319 376 { 320 377 int ret = wolfSSL_CryptHwMutexLock(); 321 378 if (ret == 0) { 322 379 #ifdef FREESCALE_MMCAU_CLASSIC_SHA 323 cau_sha256_hash_n( buf, 1, sha256->digest);380 cau_sha256_hash_n((byte*)sha256->buffer, 1, sha256->digest); 324 381 #else 325 MMCAU_SHA256_HashN( buf, 1, sha256->digest);382 MMCAU_SHA256_HashN((byte*)sha256->buffer, 1, sha256->digest); 326 383 #endif 327 384 wolfSSL_CryptHwMutexUnLock(); … … 
333 390 #include <wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h> 334 391 335 #elif defined(STM32_HASH) 336 337 /* 338 * STM32F2/F4/F7 hardware SHA256 support through the HASH_* API's from the 339 * Standard Peripheral Library or CubeMX (See note in README). 340 */ 341 342 /* STM32 register size, bytes */ 343 #ifdef WOLFSSL_STM32_CUBEMX 344 #define SHA256_REG_SIZE SHA256_BLOCK_SIZE 345 #else 346 #define SHA256_REG_SIZE 4 347 /* STM32 struct notes: 348 * sha256->buffer = first 4 bytes used to hold partial block if needed 349 * sha256->buffLen = num bytes currently stored in sha256->buffer 350 * sha256->loLen = num bytes that have been written to STM32 FIFO 351 */ 352 #endif 353 #define SHA256_HW_TIMEOUT 0xFF 392 #elif defined(STM32_HASH_SHA2) 393 394 /* Supports CubeMX HAL or Standard Peripheral Library */ 354 395 355 396 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) … … 358 399 return BAD_FUNC_ARG; 359 400 360 sha256->heap = heap; 361 XMEMSET(sha256->buffer, 0, sizeof(sha256->buffer)); 362 sha256->buffLen = 0; 363 sha256->loLen = 0; 364 sha256->hiLen = 0; 365 366 /* initialize HASH peripheral */ 367 #ifdef WOLFSSL_STM32_CUBEMX 368 HAL_HASH_DeInit(&sha256->hashHandle); 369 sha256->hashHandle.Init.DataType = HASH_DATATYPE_8B; 370 if (HAL_HASH_Init(&sha256->hashHandle) != HAL_OK) { 371 return ASYNC_INIT_E; 372 } 373 /* reset the hash control register */ 374 /* required because Cube MX is not clearing algo bits */ 375 HASH->CR &= ~HASH_CR_ALGO; 376 #else 377 HASH_DeInit(); 378 379 /* reset the hash control register */ 380 HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE); 381 382 /* configure algo used, algo mode, datatype */ 383 HASH->CR |= (HASH_AlgoSelection_SHA256 | HASH_AlgoMode_HASH 384 | HASH_DataType_8b); 385 386 /* reset HASH processor */ 387 HASH->CR |= HASH_CR_INIT; 388 #endif 389 401 (void)devId; 402 (void)heap; 403 404 wc_Stm32_Hash_Init(&sha256->stmCtx); 390 405 return 0; 391 406 } … … 394 409 { 395 410 int ret = 0; 396 byte* local;397 411 398 412 if (sha256 == NULL || (data == NULL && len > 0)) { … … 400 414 } 401 415 402 /* do block size increments */ 403 local = (byte*)sha256->buffer; 404 405 /* check that internal buffLen is valid */ 406 if (sha256->buffLen >= SHA256_REG_SIZE) 407 return BUFFER_E; 408 409 while (len) { 410 word32 add = min(len, SHA256_REG_SIZE - sha256->buffLen); 411 XMEMCPY(&local[sha256->buffLen], data, add); 412 413 sha256->buffLen += add; 414 data += add; 415 len -= add; 416 417 if (sha256->buffLen == SHA256_REG_SIZE) { 418 #ifdef WOLFSSL_STM32_CUBEMX 419 if (HAL_HASHEx_SHA256_Accumulate( 420 &sha256->hashHandle, local, SHA256_REG_SIZE) != HAL_OK) { 421 ret = ASYNC_OP_E; 422 } 423 #else 424 HASH_DataIn(*(uint32_t*)local); 425 #endif 426 427 AddLength(sha256, SHA256_REG_SIZE); 428 sha256->buffLen = 0; 429 } 416 ret = wolfSSL_CryptHwMutexLock(); 417 if (ret == 0) { 418 ret = wc_Stm32_Hash_Update(&sha256->stmCtx, 419 HASH_AlgoSelection_SHA256, data, len); 420 wolfSSL_CryptHwMutexUnLock(); 430 421 } 431 422 return ret; … … 436 427 int ret = 0; 437 428 438 if (sha256 == NULL || hash == NULL) 429 if (sha256 == NULL || hash == NULL) { 439 430 return BAD_FUNC_ARG; 440 441 #ifdef WOLFSSL_STM32_CUBEMX 442 if (HAL_HASHEx_SHA256_Start(&sha256->hashHandle, 443 (byte*)sha256->buffer, sha256->buffLen, 444 (byte*)sha256->digest, SHA256_HW_TIMEOUT) != HAL_OK) { 445 ret = ASYNC_OP_E; 446 } 447 #else 448 __IO uint16_t nbvalidbitsdata = 0; 449 450 /* finish reading any trailing bytes into FIFO */ 451 if (sha256->buffLen > 0) { 452 
HASH_DataIn(*(uint32_t*)sha256->buffer); 453 AddLength(sha256, sha256->buffLen); 454 } 455 456 /* calculate number of valid bits in last word of input data */ 457 nbvalidbitsdata = 8 * (sha256->loLen % SHA256_REG_SIZE); 458 459 /* configure number of valid bits in last word of the data */ 460 HASH_SetLastWordValidBitsNbr(nbvalidbitsdata); 461 462 /* start HASH processor */ 463 HASH_StartDigest(); 464 465 /* wait until Busy flag == RESET */ 466 while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {} 467 468 /* read message digest */ 469 sha256->digest[0] = HASH->HR[0]; 470 sha256->digest[1] = HASH->HR[1]; 471 sha256->digest[2] = HASH->HR[2]; 472 sha256->digest[3] = HASH->HR[3]; 473 sha256->digest[4] = HASH->HR[4]; 474 sha256->digest[5] = HASH_DIGEST->HR[5]; 475 sha256->digest[6] = HASH_DIGEST->HR[6]; 476 sha256->digest[7] = HASH_DIGEST->HR[7]; 477 478 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE); 479 #endif /* WOLFSSL_STM32_CUBEMX */ 480 481 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE); 482 483 (void)wc_InitSha256_ex(sha256, sha256->heap, INVALID_DEVID); 431 } 432 433 ret = wolfSSL_CryptHwMutexLock(); 434 if (ret == 0) { 435 ret = wc_Stm32_Hash_Final(&sha256->stmCtx, 436 HASH_AlgoSelection_SHA256, hash, WC_SHA256_DIGEST_SIZE); 437 wolfSSL_CryptHwMutexUnLock(); 438 } 439 440 (void)wc_InitSha256(sha256); /* reset state */ 484 441 485 442 return ret; 486 443 } 444 445 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH) 446 /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */ 447 448 #elif defined(WOLFSSL_AFALG_HASH) 449 /* implemented in wolfcrypt/src/port/af_alg/afalg_hash.c */ 450 451 #elif defined(WOLFSSL_DEVCRYPTO_HASH) 452 /* implemented in wolfcrypt/src/port/devcrypto/devcrypt_hash.c */ 487 453 488 454 #else … … 500 466 if (ret != 0) 501 467 return ret; 468 469 #ifdef WOLFSSL_SMALL_STACK_CACHE 470 sha256->W = NULL; 471 #endif 502 472 503 473 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) … … 511 481 } 512 482 #endif /* End Hardware Acceleration */ 513 514 #ifndef SAVE_XMM_YMM515 #define SAVE_XMM_YMM516 #endif517 483 518 484 #ifdef NEED_SOFT_SHA256 … … 544 510 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) 545 511 546 #define RND(a,b,c,d,e,f,g,h,i) \ 547 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \ 548 t1 = Sigma0((a)) + Maj((a), (b), (c)); \ 549 (d) += t0; \ 550 (h) = t0 + t1; 512 #define a(i) S[(0-i) & 7] 513 #define b(i) S[(1-i) & 7] 514 #define c(i) S[(2-i) & 7] 515 #define d(i) S[(3-i) & 7] 516 #define e(i) S[(4-i) & 7] 517 #define f(i) S[(5-i) & 7] 518 #define g(i) S[(6-i) & 7] 519 #define h(i) S[(7-i) & 7] 520 521 #define RND(j) \ 522 t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + W[i+j]; \ 523 t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ 524 d(j) += t0; \ 525 h(j) = t0 + t1 551 526 552 527 #ifndef XTRANSFORM 553 #define XTRANSFORM(S, B) Transform((S)) 528 #define XTRANSFORM(S) Transform_Sha256((S)) 529 #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L)) 554 530 #endif 555 531 556 static int Transform (wc_Sha256* sha256)532 static int Transform_Sha256(wc_Sha256* sha256) 557 533 { 558 534 word32 S[8], t0, t1; 559 535 int i; 560 536 561 #ifdef WOLFSSL_SMALL_STACK 537 #ifdef WOLFSSL_SMALL_STACK_CACHE 538 word32* W = sha256->W; 539 if (W == NULL) { 540 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, 541 DYNAMIC_TYPE_DIGEST); 542 if (W == NULL) 543 return MEMORY_E; 544 sha256->W = W; 545 } 546 #elif defined(WOLFSSL_SMALL_STACK) 562 547 word32* W; 563 564 
548 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, 565 549 DYNAMIC_TYPE_TMP_BUFFER); … … 580 564 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; 581 565 566 #ifdef USE_SLOW_SHA256 567 /* not unrolled - ~2k smaller and ~25% slower */ 582 568 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { 583 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0); 584 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1); 585 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2); 586 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3); 587 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4); 588 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5); 589 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6); 590 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7); 591 } 569 int j; 570 for (j = 0; j < 8; j++) { /* braces needed here for macros {} */ 571 RND(j); 572 } 573 } 574 #else 575 /* partially loop unrolled */ 576 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { 577 RND(0); RND(1); RND(2); RND(3); 578 RND(4); RND(5); RND(6); RND(7); 579 } 580 #endif /* USE_SLOW_SHA256 */ 592 581 593 582 /* Add the working vars back into digest state[] */ … … 596 585 } 597 586 598 #if def WOLFSSL_SMALL_STACK587 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE) 599 588 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 600 589 #endif 601 602 590 return 0; 603 591 } … … 606 594 607 595 608 #if defined(XTRANSFORM) || defined(STM32_HASH) 609 static INLINE void AddLength(wc_Sha256* sha256, word32 len) 596 #ifdef XTRANSFORM 597 598 static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len) 610 599 { 611 600 word32 tmp = sha256->loLen; … … 613 602 sha256->hiLen++; /* carry low to high */ 614 603 } 615 #endif 616 617 618 #ifdef XTRANSFORM 619 620 static INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) 604 605 static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) 621 606 { 622 607 int ret = 0; … … 625 610 if (sha256 == NULL || (data == NULL && len > 0)) { 626 611 return BAD_FUNC_ARG; 612 } 613 614 if (data == NULL && len == 0) { 615 /* valid, but do nothing */ 616 return 0; 627 617 } 628 618 … … 642 632 return BUFFER_E; 643 633 644 SAVE_XMM_YMM; /* for Intel AVX */ 645 646 while (len) { 634 if (sha256->buffLen > 0) { 647 635 word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen); 648 636 XMEMCPY(&local[sha256->buffLen], data, add); … … 662 650 } 663 651 #endif 664 ret = XTRANSFORM(sha256, local); 665 if (ret != 0) { 652 ret = XTRANSFORM(sha256); 653 if (ret == 0) { 654 AddLength(sha256, WC_SHA256_BLOCK_SIZE); 655 sha256->buffLen = 0; 656 } 657 else 658 len = 0; 659 } 660 } 661 662 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 663 if (Transform_Sha256_Len_p != NULL) { 664 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); 665 666 if (blocksLen > 0) { 667 AddLength(sha256, blocksLen); 668 sha256->data = data; 669 /* Byte reversal performed in function if required. */ 670 XTRANSFORM_LEN(sha256, blocksLen); 671 data += blocksLen; 672 len -= blocksLen; 673 } 674 } 675 else 676 #endif 677 #if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \ 678 defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 679 { 680 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); 681 682 AddLength(sha256, blocksLen); 683 while (len >= WC_SHA256_BLOCK_SIZE) { 684 XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE); 685 686 data += WC_SHA256_BLOCK_SIZE; 687 len -= WC_SHA256_BLOCK_SIZE; 688 689 /* Byte reversal performed in function if required. 
*/ 690 ret = XTRANSFORM(sha256); 691 if (ret != 0) 666 692 break; 667 693 } 668 669 AddLength(sha256, WC_SHA256_BLOCK_SIZE); 670 sha256->buffLen = 0; 694 } 695 #else 696 { 697 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); 698 699 AddLength(sha256, blocksLen); 700 while (len >= WC_SHA256_BLOCK_SIZE) { 701 XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE); 702 703 data += WC_SHA256_BLOCK_SIZE; 704 len -= WC_SHA256_BLOCK_SIZE; 705 706 ByteReverseWords(sha256->buffer, sha256->buffer, 707 WC_SHA256_BLOCK_SIZE); 708 ret = XTRANSFORM(sha256); 709 if (ret != 0) 710 break; 671 711 } 712 } 713 #endif 714 715 if (len > 0) { 716 XMEMCPY(local, data, len); 717 sha256->buffLen = len; 672 718 } 673 719 … … 680 726 } 681 727 682 static INLINE int Sha256Final(wc_Sha256* sha256)728 static WC_INLINE int Sha256Final(wc_Sha256* sha256) 683 729 { 684 730 … … 689 735 return BAD_FUNC_ARG; 690 736 } 691 692 SAVE_XMM_YMM; /* for Intel AVX */693 737 694 738 AddLength(sha256, sha256->buffLen); /* before adding pads */ … … 713 757 } 714 758 715 ret = XTRANSFORM(sha256 , local);759 ret = XTRANSFORM(sha256); 716 760 if (ret != 0) 717 761 return ret; … … 755 799 #endif 756 800 757 return XTRANSFORM(sha256, local); 801 return XTRANSFORM(sha256); 802 } 803 804 int wc_Sha256FinalRaw(wc_Sha256* sha256, byte* hash) 805 { 806 #ifdef LITTLE_ENDIAN_ORDER 807 word32 digest[WC_SHA256_DIGEST_SIZE / sizeof(word32)]; 808 #endif 809 810 if (sha256 == NULL || hash == NULL) { 811 return BAD_FUNC_ARG; 812 } 813 814 #ifdef LITTLE_ENDIAN_ORDER 815 ByteReverseWords((word32*)digest, (word32*)sha256->digest, 816 WC_SHA256_DIGEST_SIZE); 817 XMEMCPY(hash, digest, WC_SHA256_DIGEST_SIZE); 818 #else 819 XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); 820 #endif 821 822 return 0; 758 823 } 759 824 … … 792 857 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 793 858 794 #define _DigestToReg(S0, S1, S2, S3, S4, S5, S6, S7) \ 795 "leaq %[digest], %%r8\n\t" \ 796 "movl (%%r8), %"#S0"\n\t" \ 797 "movl 4(%%r8), %"#S1"\n\t" \ 798 "movl 8(%%r8), %"#S2"\n\t" \ 799 "movl 12(%%r8), %"#S3"\n\t" \ 800 "movl 16(%%r8), %"#S4"\n\t" \ 801 "movl 20(%%r8), %"#S5"\n\t" \ 802 "movl 24(%%r8), %"#S6"\n\t" \ 803 "movl 28(%%r8), %"#S7"\n\t" 804 805 #define _RegToDigest(S0, S1, S2, S3, S4, S5, S6, S7) \ 806 "leaq %[digest], %%r8\n\t" \ 807 "addl %"#S0", (%%r8)\n\t" \ 808 "addl %"#S1", 4(%%r8)\n\t" \ 809 "addl %"#S2", 8(%%r8)\n\t" \ 810 "addl %"#S3", 12(%%r8)\n\t" \ 811 "addl %"#S4", 16(%%r8)\n\t" \ 812 "addl %"#S5", 20(%%r8)\n\t" \ 813 "addl %"#S6", 24(%%r8)\n\t" \ 814 "addl %"#S7", 28(%%r8)\n\t" 815 816 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 817 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 818 819 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 820 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 821 822 823 #define S_0 %r15d 824 #define S_1 %r10d 825 #define S_2 %r11d 826 #define S_3 %r12d 827 #define S_4 %r13d 828 #define S_5 %r14d 829 #define S_6 %ebx 830 #define S_7 %r9d 831 832 #define SSE_REGs "%edi", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15" 859 #define _LOAD_DIGEST() \ 860 "movl (%[sha256]), %%r8d \n\t" \ 861 "movl 4(%[sha256]), %%r9d \n\t" \ 862 "movl 8(%[sha256]), %%r10d\n\t" \ 863 "movl 12(%[sha256]), %%r11d\n\t" \ 864 "movl 16(%[sha256]), %%r12d\n\t" \ 865 "movl 20(%[sha256]), %%r13d\n\t" \ 866 "movl 24(%[sha256]), %%r14d\n\t" \ 867 "movl 28(%[sha256]), %%r15d\n\t" 868 869 #define _STORE_ADD_DIGEST() \ 870 "addl %%r8d , (%[sha256])\n\t" \ 871 "addl %%r9d , 
4(%[sha256])\n\t" \ 872 "addl %%r10d, 8(%[sha256])\n\t" \ 873 "addl %%r11d, 12(%[sha256])\n\t" \ 874 "addl %%r12d, 16(%[sha256])\n\t" \ 875 "addl %%r13d, 20(%[sha256])\n\t" \ 876 "addl %%r14d, 24(%[sha256])\n\t" \ 877 "addl %%r15d, 28(%[sha256])\n\t" 878 879 #define _ADD_DIGEST() \ 880 "addl (%[sha256]), %%r8d \n\t" \ 881 "addl 4(%[sha256]), %%r9d \n\t" \ 882 "addl 8(%[sha256]), %%r10d\n\t" \ 883 "addl 12(%[sha256]), %%r11d\n\t" \ 884 "addl 16(%[sha256]), %%r12d\n\t" \ 885 "addl 20(%[sha256]), %%r13d\n\t" \ 886 "addl 24(%[sha256]), %%r14d\n\t" \ 887 "addl 28(%[sha256]), %%r15d\n\t" 888 889 #define _STORE_DIGEST() \ 890 "movl %%r8d , (%[sha256])\n\t" \ 891 "movl %%r9d , 4(%[sha256])\n\t" \ 892 "movl %%r10d, 8(%[sha256])\n\t" \ 893 "movl %%r11d, 12(%[sha256])\n\t" \ 894 "movl %%r12d, 16(%[sha256])\n\t" \ 895 "movl %%r13d, 20(%[sha256])\n\t" \ 896 "movl %%r14d, 24(%[sha256])\n\t" \ 897 "movl %%r15d, 28(%[sha256])\n\t" 898 899 #define LOAD_DIGEST() \ 900 _LOAD_DIGEST() 901 902 #define STORE_ADD_DIGEST() \ 903 _STORE_ADD_DIGEST() 904 905 #define ADD_DIGEST() \ 906 _ADD_DIGEST() 907 908 #define STORE_DIGEST() \ 909 _STORE_DIGEST() 910 911 912 #define S_0 %r8d 913 #define S_1 %r9d 914 #define S_2 %r10d 915 #define S_3 %r11d 916 #define S_4 %r12d 917 #define S_5 %r13d 918 #define S_6 %r14d 919 #define S_7 %r15d 920 921 #define L1 "%%edx" 922 #define L2 "%%ecx" 923 #define L3 "%%eax" 924 #define L4 "%%ebx" 925 #define WK "%%rsp" 926 927 #define WORK_REGS "eax", "ebx", "ecx", "edx" 928 #define STATE_REGS "r8","r9","r10","r11","r12","r13","r14","r15" 929 #define XMM_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", \ 930 "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13" 833 931 834 932 #if defined(HAVE_INTEL_RORX) 835 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i) \ 836 "# edx = e>>>6\n\t" \ 837 "rorx $6, %"#e", %%edx\n\t" 838 839 #define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i) \ 840 "# edi = e>>>11\n\t" \ 841 "rorx $11, %"#e",%%edi\n\t" \ 842 "# edi = (e>>11) ^ (e>>6)\n\t" \ 843 "xorl %%edx, %%edi\n\t" \ 844 "# edx = e>>>25\n\t" \ 845 "rorx $25, %"#e", %%edx\n\t" 846 847 #define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i) \ 848 "# esi = f\n\t" \ 849 "movl %"#f", %%esi\n\t" \ 850 "# esi = f ^ g\n\t" \ 851 "xorl %"#g", %%esi\n\t" \ 852 "# edx = Sigma1(e)\n\t" \ 853 "xorl %%edi, %%edx\n\t" \ 854 "# esi = (f ^ g) & e\n\t" \ 855 "andl %"#e", %%esi\n\t" \ 856 "# esi = Ch(e,f,g)\n\t" \ 857 "xorl %"#g", %%esi\n\t" 858 859 #define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i) \ 860 "# h += w_k\n\t" \ 861 "leaq %[W_K], %%r8\n\t" \ 862 "addl ("#i")*4(%%r8), %"#h"\n\t" \ 863 "# h = h + w_k + Sigma1(e)\n\t" \ 864 "addl %%edx, %"#h"\n\t" \ 865 "# r8d = a>>>2\n\t" \ 866 "rorx $2, %"#a", %%r8d\n\t" \ 867 "# edi = a>>>13\n\t" \ 868 "rorx $13, %"#a", %%edi\n\t" 869 870 #define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i) \ 871 "# edx = a>>22\n\t" \ 872 "rorx $22, %"#a", %%edx\n\t" \ 873 "# edi = (a>>>2) ^ (a>>>13)\n\t" \ 874 "xorl %%r8d, %%edi\n\t" \ 875 "# edx = Sigma0(a)\n\t" \ 876 "xorl %%edi, %%edx\n\t" 877 878 #define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i) \ 879 "# edi = b\n\t" \ 880 "movl %"#b", %%edi\n\t" \ 881 "# edi = a | b\n\t" \ 882 "orl %"#a", %%edi\n\t" \ 883 "# edi = (a | b) & c\n\t" \ 884 "andl %"#c", %%edi\n\t" \ 885 "# r8d = b\n\t" \ 886 "movl %"#b", %%r8d\n\t" 887 888 #define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i) \ 889 "# h += Ch(e,f,g)\n\t" \ 890 "addl %%esi, %"#h"\n\t" \ 891 "# r8d = b & a\n\t" \ 892 "andl %"#a", %%r8d\n\t" \ 893 "# r8d = Maj(a,b,c)\n\t" \ 894 "orl %%edi, %%r8d\n\t" 895 896 #define 
RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i) \ 897 "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \ 933 #define RND_STEP_RORX_0_1(a, b, c, d, e, f, g, h, i) \ 934 /* L3 = f */ \ 935 "movl %" #f ", " L3 "\n\t" \ 936 /* L2 = e>>>11 */ \ 937 "rorx $11, %" #e ", " L2 "\n\t" \ 938 /* h += w_k */ \ 939 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 940 941 #define RND_STEP_RORX_0_2(a, b, c, d, e, f, g, h, i) \ 942 /* L2 = (e>>>6) ^ (e>>>11) */ \ 943 "xorl " L1 ", " L2 "\n\t" \ 944 /* L3 = f ^ g */ \ 945 "xorl %" #g ", " L3 "\n\t" \ 946 /* L1 = e>>>25 */ \ 947 "rorx $25, %" #e ", " L1 "\n\t" \ 948 949 #define RND_STEP_RORX_0_3(a, b, c, d, e, f, g, h, i) \ 950 /* L3 = (f ^ g) & e */ \ 951 "andl %" #e ", " L3 "\n\t" \ 952 /* L1 = Sigma1(e) */ \ 953 "xorl " L2 ", " L1 "\n\t" \ 954 /* L2 = a>>>13 */ \ 955 "rorx $13, %" #a ", " L2 "\n\t" \ 956 957 #define RND_STEP_RORX_0_4(a, b, c, d, e, f, g, h, i) \ 958 /* h += Sigma1(e) */ \ 959 "addl " L1 ", %" #h "\n\t" \ 960 /* L1 = a>>>2 */ \ 961 "rorx $2, %" #a ", " L1 "\n\t" \ 962 /* L3 = Ch(e,f,g) */ \ 963 "xorl %" #g ", " L3 "\n\t" \ 964 965 #define RND_STEP_RORX_0_5(a, b, c, d, e, f, g, h, i) \ 966 /* L2 = (a>>>2) ^ (a>>>13) */ \ 967 "xorl " L1 ", " L2 "\n\t" \ 968 /* L1 = a>>>22 */ \ 969 "rorx $22, %" #a ", " L1 "\n\t" \ 970 /* h += Ch(e,f,g) */ \ 971 "addl " L3 ", %" #h "\n\t" \ 972 973 #define RND_STEP_RORX_0_6(a, b, c, d, e, f, g, h, i) \ 974 /* L1 = Sigma0(a) */ \ 975 "xorl " L2 ", " L1 "\n\t" \ 976 /* L3 = b */ \ 977 "movl %" #b ", " L3 "\n\t" \ 978 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 979 "addl %" #h ", %" #d "\n\t" \ 980 981 #define RND_STEP_RORX_0_7(a, b, c, d, e, f, g, h, i) \ 982 /* L3 = a ^ b */ \ 983 "xorl %" #a ", " L3 "\n\t" \ 984 /* h += Sigma0(a) */ \ 985 "addl " L1 ", %" #h "\n\t" \ 986 /* L4 = (a ^ b) & (b ^ c) */ \ 987 "andl " L3 ", " L4 "\n\t" \ 988 989 #define RND_STEP_RORX_0_8(a, b, c, d, e, f, g, h, i) \ 990 /* L4 = Maj(a,b,c) */ \ 991 "xorl %" #b ", " L4 "\n\t" \ 992 /* L1 = d>>>6 (= e>>>6 next RND) */ \ 993 "rorx $6, %" #d ", " L1 "\n\t" \ 994 /* h += Maj(a,b,c) */ \ 995 "addl " L4 ", %" #h "\n\t" \ 996 997 #define RND_STEP_RORX_1_1(a, b, c, d, e, f, g, h, i) \ 998 /* L4 = f */ \ 999 "movl %" #f ", " L4 "\n\t" \ 1000 /* L2 = e>>>11 */ \ 1001 "rorx $11, %" #e ", " L2 "\n\t" \ 1002 /* h += w_k */ \ 1003 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 1004 1005 #define RND_STEP_RORX_1_2(a, b, c, d, e, f, g, h, i) \ 1006 /* L2 = (e>>>6) ^ (e>>>11) */ \ 1007 "xorl " L1 ", " L2 "\n\t" \ 1008 /* L4 = f ^ g */ \ 1009 "xorl %" #g ", " L4 "\n\t" \ 1010 /* L1 = e>>>25 */ \ 1011 "rorx $25, %" #e ", " L1 "\n\t" \ 1012 1013 #define RND_STEP_RORX_1_3(a, b, c, d, e, f, g, h, i) \ 1014 /* L4 = (f ^ g) & e */ \ 1015 "andl %" #e ", " L4 "\n\t" \ 1016 /* L1 = Sigma1(e) */ \ 1017 "xorl " L2 ", " L1 "\n\t" \ 1018 /* L2 = a>>>13 */ \ 1019 "rorx $13, %" #a ", " L2 "\n\t" \ 1020 1021 #define RND_STEP_RORX_1_4(a, b, c, d, e, f, g, h, i) \ 1022 /* h += Sigma1(e) */ \ 1023 "addl " L1 ", %" #h "\n\t" \ 1024 /* L1 = a>>>2 */ \ 1025 "rorx $2, %" #a ", " L1 "\n\t" \ 1026 /* L4 = Ch(e,f,g) */ \ 1027 "xorl %" #g ", " L4 "\n\t" \ 1028 1029 #define RND_STEP_RORX_1_5(a, b, c, d, e, f, g, h, i) \ 1030 /* L2 = (a>>>2) ^ (a>>>13) */ \ 1031 "xorl " L1 ", " L2 "\n\t" \ 1032 /* L1 = a>>>22 */ \ 1033 "rorx $22, %" #a ", " L1 "\n\t" \ 1034 /* h += Ch(e,f,g) */ \ 1035 "addl " L4 ", %" #h "\n\t" \ 1036 1037 #define RND_STEP_RORX_1_6(a, b, c, d, e, f, g, h, i) \ 1038 /* L1 = Sigma0(a) */ \ 1039 "xorl " L2 ", " L1 "\n\t" \ 1040 /* L4 = b */ \ 1041 "movl %" #b ", " L4 "\n\t" \ 1042 /* d += h + 
w_k + Sigma1(e) + Ch(e,f,g) */ \ 898 1043 "addl %"#h", %"#d"\n\t" \ 899 "addl %"#h", %%r8d\n\t" \ 900 "addl %%edx, %%r8d\n\t" \ 901 "movl %%r8d, %"#h"\n\t" 1044 1045 #define RND_STEP_RORX_1_7(a, b, c, d, e, f, g, h, i) \ 1046 /* L4 = a ^ b */ \ 1047 "xorl %" #a ", " L4 "\n\t" \ 1048 /* h += Sigma0(a) */ \ 1049 "addl " L1 ", %" #h "\n\t" \ 1050 /* L3 = (a ^ b) & (b ^ c) */ \ 1051 "andl " L4 ", " L3 "\n\t" \ 1052 1053 #define RND_STEP_RORX_1_8(a, b, c, d, e, f, g, h, i) \ 1054 /* L3 = Maj(a,b,c) */ \ 1055 "xorl %" #b ", " L3 "\n\t" \ 1056 /* L1 = d>>>6 (= e>>>6 next RND) */ \ 1057 "rorx $6, %" #d ", " L1 "\n\t" \ 1058 /* h += Maj(a,b,c) */ \ 1059 "addl " L3 ", %" #h "\n\t" \ 1060 1061 #define _RND_RORX_X_0(a, b, c, d, e, f, g, h, i) \ 1062 /* L1 = e>>>6 */ \ 1063 "rorx $6, %" #e ", " L1 "\n\t" \ 1064 /* L2 = e>>>11 */ \ 1065 "rorx $11, %" #e ", " L2 "\n\t" \ 1066 /* Prev RND: h += Maj(a,b,c) */ \ 1067 "addl " L3 ", %" #a "\n\t" \ 1068 /* h += w_k */ \ 1069 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 1070 /* L3 = f */ \ 1071 "movl %" #f ", " L3 "\n\t" \ 1072 /* L2 = (e>>>6) ^ (e>>>11) */ \ 1073 "xorl " L1 ", " L2 "\n\t" \ 1074 /* L3 = f ^ g */ \ 1075 "xorl %" #g ", " L3 "\n\t" \ 1076 /* L1 = e>>>25 */ \ 1077 "rorx $25, %" #e ", " L1 "\n\t" \ 1078 /* L1 = Sigma1(e) */ \ 1079 "xorl " L2 ", " L1 "\n\t" \ 1080 /* L3 = (f ^ g) & e */ \ 1081 "andl %" #e ", " L3 "\n\t" \ 1082 /* h += Sigma1(e) */ \ 1083 "addl " L1 ", %" #h "\n\t" \ 1084 /* L1 = a>>>2 */ \ 1085 "rorx $2, %" #a ", " L1 "\n\t" \ 1086 /* L2 = a>>>13 */ \ 1087 "rorx $13, %" #a ", " L2 "\n\t" \ 1088 /* L3 = Ch(e,f,g) */ \ 1089 "xorl %" #g ", " L3 "\n\t" \ 1090 /* L2 = (a>>>2) ^ (a>>>13) */ \ 1091 "xorl " L1 ", " L2 "\n\t" \ 1092 /* L1 = a>>>22 */ \ 1093 "rorx $22, %" #a ", " L1 "\n\t" \ 1094 /* h += Ch(e,f,g) */ \ 1095 "addl " L3 ", %" #h "\n\t" \ 1096 /* L1 = Sigma0(a) */ \ 1097 "xorl " L2 ", " L1 "\n\t" \ 1098 /* L3 = b */ \ 1099 "movl %" #b ", " L3 "\n\t" \ 1100 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 1101 "addl %" #h ", %" #d "\n\t" \ 1102 /* L3 = a ^ b */ \ 1103 "xorl %" #a ", " L3 "\n\t" \ 1104 /* L4 = (a ^ b) & (b ^ c) */ \ 1105 "andl " L3 ", " L4 "\n\t" \ 1106 /* h += Sigma0(a) */ \ 1107 "addl " L1 ", %" #h "\n\t" \ 1108 /* L4 = Maj(a,b,c) */ \ 1109 "xorl %" #b ", " L4 "\n\t" \ 1110 1111 #define _RND_RORX_X_1(a, b, c, d, e, f, g, h, i) \ 1112 /* L1 = e>>>6 */ \ 1113 "rorx $6, %" #e ", " L1 "\n\t" \ 1114 /* L2 = e>>>11 */ \ 1115 "rorx $11, %" #e ", " L2 "\n\t" \ 1116 /* Prev RND: h += Maj(a,b,c) */ \ 1117 "addl " L4 ", %" #a "\n\t" \ 1118 /* h += w_k */ \ 1119 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 1120 /* L4 = f */ \ 1121 "movl %" #f ", " L4 "\n\t" \ 1122 /* L2 = (e>>>6) ^ (e>>>11) */ \ 1123 "xorl " L1 ", " L2 "\n\t" \ 1124 /* L4 = f ^ g */ \ 1125 "xorl %" #g ", " L4 "\n\t" \ 1126 /* L1 = e>>>25 */ \ 1127 "rorx $25, %" #e ", " L1 "\n\t" \ 1128 /* L1 = Sigma1(e) */ \ 1129 "xorl " L2 ", " L1 "\n\t" \ 1130 /* L4 = (f ^ g) & e */ \ 1131 "andl %" #e ", " L4 "\n\t" \ 1132 /* h += Sigma1(e) */ \ 1133 "addl " L1 ", %" #h "\n\t" \ 1134 /* L1 = a>>>2 */ \ 1135 "rorx $2, %" #a ", " L1 "\n\t" \ 1136 /* L2 = a>>>13 */ \ 1137 "rorx $13, %" #a ", " L2 "\n\t" \ 1138 /* L4 = Ch(e,f,g) */ \ 1139 "xorl %" #g ", " L4 "\n\t" \ 1140 /* L2 = (a>>>2) ^ (a>>>13) */ \ 1141 "xorl " L1 ", " L2 "\n\t" \ 1142 /* L1 = a>>>22 */ \ 1143 "rorx $22, %" #a ", " L1 "\n\t" \ 1144 /* h += Ch(e,f,g) */ \ 1145 "addl " L4 ", %" #h "\n\t" \ 1146 /* L1 = Sigma0(a) */ \ 1147 "xorl " L2 ", " L1 "\n\t" \ 1148 /* L4 = b */ \ 1149 "movl %" #b ", " L4 "\n\t" \ 1150 /* d += h 
+ w_k + Sigma1(e) + Ch(e,f,g) */ \ 1151 "addl %" #h ", %" #d "\n\t" \ 1152 /* L4 = a ^ b */ \ 1153 "xorl %" #a ", " L4 "\n\t" \ 1154 /* L2 = (a ^ b) & (b ^ c) */ \ 1155 "andl " L4 ", " L3 "\n\t" \ 1156 /* h += Sigma0(a) */ \ 1157 "addl " L1 ", %" #h "\n\t" \ 1158 /* L3 = Maj(a,b,c) */ \ 1159 "xorl %" #b ", " L3 "\n\t" \ 1160 1161 1162 #define RND_RORX_X_0(a,b,c,d,e,f,g,h,i) \ 1163 _RND_RORX_X_0(a,b,c,d,e,f,g,h,i) 1164 #define RND_RORX_X_1(a,b,c,d,e,f,g,h,i) \ 1165 _RND_RORX_X_1(a,b,c,d,e,f,g,h,i) 1166 1167 #define RND_RORX_X4(a,b,c,d,e,f,g,h,i) \ 1168 RND_RORX_X_0(a,b,c,d,e,f,g,h,i+0) \ 1169 RND_RORX_X_1(h,a,b,c,d,e,f,g,i+1) \ 1170 RND_RORX_X_0(g,h,a,b,c,d,e,f,i+2) \ 1171 RND_RORX_X_1(f,g,h,a,b,c,d,e,i+3) 1172 902 1173 #endif /* HAVE_INTEL_RORX */ 903 1174 904 #define RND_STEP_1(a,b,c,d,e,f,g,h,i) \ 905 "movl %"#e", %%edx\n\t" \ 906 "# edx = e>>>6\n\t" \ 907 "roll $26, %%edx\n\t" \ 908 "movl %"#e", %%edi\n\t" 909 910 #define RND_STEP_2(a,b,c,d,e,f,g,h,i) \ 911 "# edi = e>>>11\n\t" \ 912 "roll $21, %%edi\n\t" \ 913 "# edi = (e>>11) ^ (e>>6)\n\t" \ 914 "xorl %%edx, %%edi\n\t" \ 915 "# edx = e\n\t" \ 916 "movl %"#e", %%edx\n\t" \ 917 "# edx = e>>>25\n\t" \ 918 "roll $7, %%edx\n\t" 919 920 #define RND_STEP_3(a,b,c,d,e,f,g,h,i) \ 921 "# esi = f\n\t" \ 922 "movl %"#f", %%esi\n\t" \ 923 "# esi = f ^ g\n\t" \ 924 "xorl %"#g", %%esi\n\t" \ 925 "# edx = Sigma1(e)\n\t" \ 926 "xorl %%edi, %%edx\n\t" \ 927 "# esi = (f ^ g) & e\n\t" \ 928 "andl %"#e", %%esi\n\t" \ 929 "# esi = Ch(e,f,g)\n\t" \ 930 "xorl %"#g", %%esi\n\t" 931 932 #define RND_STEP_4(a,b,c,d,e,f,g,h,i) \ 933 "# h += w_k\n\t" \ 934 "leaq %[W_K], %%r8\n\t" \ 935 "addl ("#i")*4(%%r8), %"#h"\n\t" \ 936 "# h = h + w_k + Sigma1(e)\n\t" \ 937 "addl %%edx, %"#h"\n\t" \ 938 "# r8d = a\n\t" \ 939 "movl %"#a", %%r8d\n\t" \ 940 "# r8d = a>>>2\n\t" \ 941 "roll $30, %%r8d\n\t" \ 942 "# edi = a\n\t" \ 943 "movl %"#a", %%edi\n\t" \ 944 "# edi = a>>>13\n\t" \ 945 "roll $19, %%edi\n\t" \ 946 "# edx = a\n\t" \ 947 "movl %"#a", %%edx\n\t" 948 949 #define RND_STEP_5(a,b,c,d,e,f,g,h,i) \ 950 "# edx = a>>>22\n\t" \ 951 "roll $10, %%edx\n\t" \ 952 "# edi = (a>>>2) ^ (a>>>13)\n\t" \ 953 "xorl %%r8d, %%edi\n\t" \ 954 "# edx = Sigma0(a)\n\t" \ 955 "xorl %%edi, %%edx\n\t" 956 957 #define RND_STEP_6(a,b,c,d,e,f,g,h,i) \ 958 "# edi = b\n\t" \ 959 "movl %"#b", %%edi\n\t" \ 960 "# edi = a | b\n\t" \ 961 "orl %"#a", %%edi\n\t" \ 962 "# edi = (a | b) & c\n\t" \ 963 "andl %"#c", %%edi\n\t" \ 964 "# r8d = b\n\t" \ 965 "movl %"#b", %%r8d\n\t" 966 967 #define RND_STEP_7(a,b,c,d,e,f,g,h,i) \ 968 "# h += Ch(e,f,g)\n\t" \ 969 "addl %%esi, %"#h"\n\t" \ 970 "#r8d = b & a\n\t" \ 971 "andl %"#a", %%r8d\n\t" \ 972 "# r8d = Maj(a,b,c)\n\t" \ 973 "orl %%edi, %%r8d\n\t" 974 975 #define RND_STEP_8(a,b,c,d,e,f,g,h,i) \ 976 "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \ 1175 #define RND_STEP_0_1(a,b,c,d,e,f,g,h,i) \ 1176 /* L1 = e>>>14 */ \ 1177 "rorl $14, " L1 "\n\t" \ 1178 1179 #define RND_STEP_0_2(a,b,c,d,e,f,g,h,i) \ 1180 /* L3 = b */ \ 1181 "movl %" #b ", " L3 "\n\t" \ 1182 /* L2 = f */ \ 1183 "movl %" #f ", " L2 "\n\t" \ 1184 /* h += w_k */ \ 1185 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 1186 /* L2 = f ^ g */ \ 1187 "xorl %" #g ", " L2 "\n\t" \ 1188 1189 #define RND_STEP_0_3(a,b,c,d,e,f,g,h,i) \ 1190 /* L1 = (e>>>14) ^ e */ \ 1191 "xorl %" #e ", " L1 "\n\t" \ 1192 /* L2 = (f ^ g) & e */ \ 1193 "andl %" #e ", " L2 "\n\t" \ 1194 1195 #define RND_STEP_0_4(a,b,c,d,e,f,g,h,i) \ 1196 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 1197 "rorl $5, " L1 "\n\t" \ 1198 /* L2 = Ch(e,f,g) */ \ 1199 "xorl %" #g 
", " L2 "\n\t" \ 1200 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 1201 "xorl %" #e ", " L1 "\n\t" \ 1202 /* h += Ch(e,f,g) */ \ 1203 "addl " L2 ", %" #h "\n\t" \ 1204 1205 #define RND_STEP_0_5(a,b,c,d,e,f,g,h,i) \ 1206 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 1207 "rorl $6, " L1 "\n\t" \ 1208 /* L3 = a ^ b (= b ^ c of next RND) */ \ 1209 "xorl %" #a ", " L3 "\n\t" \ 1210 /* h = h + w_k + Sigma1(e) */ \ 1211 "addl " L1 ", %" #h "\n\t" \ 1212 /* L2 = a */ \ 1213 "movl %" #a ", " L2 "\n\t" \ 1214 1215 #define RND_STEP_0_6(a,b,c,d,e,f,g,h,i) \ 1216 /* L3 = (a ^ b) & (b ^ c) */ \ 1217 "andl " L3 ", " L4 "\n\t" \ 1218 /* L2 = a>>>9 */ \ 1219 "rorl $9, " L2 "\n\t" \ 1220 /* L2 = (a>>>9) ^ a */ \ 1221 "xorl %" #a ", " L2 "\n\t" \ 1222 /* L1 = Maj(a,b,c) */ \ 1223 "xorl %" #b ", " L4 "\n\t" \ 1224 1225 #define RND_STEP_0_7(a,b,c,d,e,f,g,h,i) \ 1226 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 1227 "rorl $11, " L2 "\n\t" \ 1228 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 1229 "addl %" #h ", %" #d "\n\t" \ 1230 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 1231 "xorl %" #a ", " L2 "\n\t" \ 1232 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 1233 "addl " L4 ", %" #h "\n\t" \ 1234 1235 #define RND_STEP_0_8(a,b,c,d,e,f,g,h,i) \ 1236 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 1237 "rorl $2, " L2 "\n\t" \ 1238 /* L1 = d (e of next RND) */ \ 1239 "movl %" #d ", " L1 "\n\t" \ 1240 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 1241 "addl " L2 ", %" #h "\n\t" \ 1242 1243 #define RND_STEP_1_1(a,b,c,d,e,f,g,h,i) \ 1244 /* L1 = e>>>14 */ \ 1245 "rorl $14, " L1 "\n\t" \ 1246 1247 #define RND_STEP_1_2(a,b,c,d,e,f,g,h,i) \ 1248 /* L3 = b */ \ 1249 "movl %" #b ", " L4 "\n\t" \ 1250 /* L2 = f */ \ 1251 "movl %" #f ", " L2 "\n\t" \ 1252 /* h += w_k */ \ 1253 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 1254 /* L2 = f ^ g */ \ 1255 "xorl %" #g ", " L2 "\n\t" \ 1256 1257 #define RND_STEP_1_3(a,b,c,d,e,f,g,h,i) \ 1258 /* L1 = (e>>>14) ^ e */ \ 1259 "xorl %" #e ", " L1 "\n\t" \ 1260 /* L2 = (f ^ g) & e */ \ 1261 "andl %" #e ", " L2 "\n\t" \ 1262 1263 #define RND_STEP_1_4(a,b,c,d,e,f,g,h,i) \ 1264 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 1265 "rorl $5, " L1 "\n\t" \ 1266 /* L2 = Ch(e,f,g) */ \ 1267 "xorl %" #g ", " L2 "\n\t" \ 1268 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 1269 "xorl %" #e ", " L1 "\n\t" \ 1270 /* h += Ch(e,f,g) */ \ 1271 "addl " L2 ", %" #h "\n\t" \ 1272 1273 #define RND_STEP_1_5(a,b,c,d,e,f,g,h,i) \ 1274 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 1275 "rorl $6, " L1 "\n\t" \ 1276 /* L4 = a ^ b (= b ^ c of next RND) */ \ 1277 "xorl %" #a ", " L4 "\n\t" \ 1278 /* h = h + w_k + Sigma1(e) */ \ 1279 "addl " L1 ", %" #h "\n\t" \ 1280 /* L2 = a */ \ 1281 "movl %" #a ", " L2 "\n\t" \ 1282 1283 #define RND_STEP_1_6(a,b,c,d,e,f,g,h,i) \ 1284 /* L3 = (a ^ b) & (b ^ c) */ \ 1285 "andl " L4 ", " L3 "\n\t" \ 1286 /* L2 = a>>>9 */ \ 1287 "rorl $9, " L2 "\n\t" \ 1288 /* L2 = (a>>>9) ^ a */ \ 1289 "xorl %" #a ", " L2 "\n\t" \ 1290 /* L1 = Maj(a,b,c) */ \ 1291 "xorl %" #b ", " L3 "\n\t" \ 1292 1293 #define RND_STEP_1_7(a,b,c,d,e,f,g,h,i) \ 1294 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 1295 "rorl $11, " L2 "\n\t" \ 1296 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 1297 "addl %" #h ", %" #d "\n\t" \ 1298 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 1299 "xorl %" #a ", " L2 "\n\t" \ 1300 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 1301 "addl " L3 ", %" #h "\n\t" \ 1302 1303 #define RND_STEP_1_8(a,b,c,d,e,f,g,h,i) \ 1304 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 1305 "rorl $2, " L2 "\n\t" \ 1306 /* L1 = 
d (e of next RND) */ \ 1307 "movl %" #d ", " L1 "\n\t" \ 1308 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 1309 "addl " L2 ", %" #h "\n\t" \ 1310 1311 #define _RND_ALL_0(a,b,c,d,e,f,g,h,i) \ 1312 /* h += w_k */ \ 1313 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 1314 /* L2 = f */ \ 1315 "movl %" #f ", " L2 "\n\t" \ 1316 /* L3 = b */ \ 1317 "movl %" #b ", " L3 "\n\t" \ 1318 /* L2 = f ^ g */ \ 1319 "xorl %" #g ", " L2 "\n\t" \ 1320 /* L1 = e>>>14 */ \ 1321 "rorl $14, " L1 "\n\t" \ 1322 /* L2 = (f ^ g) & e */ \ 1323 "andl %" #e ", " L2 "\n\t" \ 1324 /* L1 = (e>>>14) ^ e */ \ 1325 "xorl %" #e ", " L1 "\n\t" \ 1326 /* L2 = Ch(e,f,g) */ \ 1327 "xorl %" #g ", " L2 "\n\t" \ 1328 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 1329 "rorl $5, " L1 "\n\t" \ 1330 /* h += Ch(e,f,g) */ \ 1331 "addl " L2 ", %" #h "\n\t" \ 1332 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 1333 "xorl %" #e ", " L1 "\n\t" \ 1334 /* L3 = a ^ b */ \ 1335 "xorl %" #a ", " L3 "\n\t" \ 1336 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 1337 "rorl $6, " L1 "\n\t" \ 1338 /* L2 = a */ \ 1339 "movl %" #a ", " L2 "\n\t" \ 1340 /* h = h + w_k + Sigma1(e) */ \ 1341 "addl " L1 ", %" #h "\n\t" \ 1342 /* L2 = a>>>9 */ \ 1343 "rorl $9, " L2 "\n\t" \ 1344 /* L3 = (a ^ b) & (b ^ c) */ \ 1345 "andl " L3 ", " L4 "\n\t" \ 1346 /* L2 = (a>>>9) ^ a */ \ 1347 "xorl %" #a ", " L2 "\n\t" \ 1348 /* L1 = Maj(a,b,c) */ \ 1349 "xorl %" #b ", " L4 "\n\t" \ 1350 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 1351 "rorl $11, " L2 "\n\t" \ 1352 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 1353 "addl %" #h ", %" #d "\n\t" \ 1354 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 1355 "xorl %" #a ", " L2 "\n\t" \ 1356 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 1357 "addl " L4 ", %" #h "\n\t" \ 1358 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 1359 "rorl $2, " L2 "\n\t" \ 1360 /* L1 = d (e of next RND) */ \ 1361 "movl %" #d ", " L1 "\n\t" \ 1362 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 1363 "addl " L2 ", %" #h "\n\t" \ 1364 1365 #define _RND_ALL_1(a,b,c,d,e,f,g,h,i) \ 1366 /* h += w_k */ \ 1367 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 1368 /* L2 = f */ \ 1369 "movl %" #f ", " L2 "\n\t" \ 1370 /* L3 = b */ \ 1371 "movl %" #b ", " L4 "\n\t" \ 1372 /* L2 = f ^ g */ \ 1373 "xorl %" #g ", " L2 "\n\t" \ 1374 /* L1 = e>>>14 */ \ 1375 "rorl $14, " L1 "\n\t" \ 1376 /* L2 = (f ^ g) & e */ \ 1377 "andl %" #e ", " L2 "\n\t" \ 1378 /* L1 = (e>>>14) ^ e */ \ 1379 "xorl %" #e ", " L1 "\n\t" \ 1380 /* L2 = Ch(e,f,g) */ \ 1381 "xorl %" #g ", " L2 "\n\t" \ 1382 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 1383 "rorl $5, " L1 "\n\t" \ 1384 /* h += Ch(e,f,g) */ \ 1385 "addl " L2 ", %" #h "\n\t" \ 1386 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 1387 "xorl %" #e ", " L1 "\n\t" \ 1388 /* L3 = a ^ b */ \ 1389 "xorl %" #a ", " L4 "\n\t" \ 1390 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 1391 "rorl $6, " L1 "\n\t" \ 1392 /* L2 = a */ \ 1393 "movl %" #a ", " L2 "\n\t" \ 1394 /* h = h + w_k + Sigma1(e) */ \ 1395 "addl " L1 ", %" #h "\n\t" \ 1396 /* L2 = a>>>9 */ \ 1397 "rorl $9, " L2 "\n\t" \ 1398 /* L3 = (a ^ b) & (b ^ c) */ \ 1399 "andl " L4 ", " L3 "\n\t" \ 1400 /* L2 = (a>>>9) ^ a */ \ 1401 "xorl %" #a", " L2 "\n\t" \ 1402 /* L1 = Maj(a,b,c) */ \ 1403 "xorl %" #b ", " L3 "\n\t" \ 1404 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 1405 "rorl $11, " L2 "\n\t" \ 1406 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 977 1407 "addl %"#h", %"#d"\n\t" \ 978 "# r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c)\n\t" \ 979 "addl %"#h", %%r8d\n\t" \ 980 "# r8b = h + w_k + Sigma1(e) Sigma0(a) + 
Ch(e,f,g) + Maj(a,b,c)\n\t" \ 981 "addl %%edx, %%r8d\n\t" \ 982 "# h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)\n\t" \ 983 "movl %%r8d, %"#h"\n\t" 984 985 #define RND_X(a,b,c,d,e,f,g,h,i) \ 986 RND_STEP_1(a,b,c,d,e,f,g,h,i) \ 987 RND_STEP_2(a,b,c,d,e,f,g,h,i) \ 988 RND_STEP_3(a,b,c,d,e,f,g,h,i) \ 989 RND_STEP_4(a,b,c,d,e,f,g,h,i) \ 990 RND_STEP_5(a,b,c,d,e,f,g,h,i) \ 991 RND_STEP_6(a,b,c,d,e,f,g,h,i) \ 992 RND_STEP_7(a,b,c,d,e,f,g,h,i) \ 993 RND_STEP_8(a,b,c,d,e,f,g,h,i) 994 995 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 996 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) 997 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) 998 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) 999 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) 1000 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) 1001 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) 1002 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) 1003 1004 1005 #define RND_1_3(a,b,c,d,e,f,g,h,i) \ 1006 RND_STEP_1(a,b,c,d,e,f,g,h,i) \ 1007 RND_STEP_2(a,b,c,d,e,f,g,h,i) \ 1008 RND_STEP_3(a,b,c,d,e,f,g,h,i) 1009 1010 #define RND_4_6(a,b,c,d,e,f,g,h,i) \ 1011 RND_STEP_4(a,b,c,d,e,f,g,h,i) \ 1012 RND_STEP_5(a,b,c,d,e,f,g,h,i) \ 1013 RND_STEP_6(a,b,c,d,e,f,g,h,i) 1014 1015 #define RND_7_8(a,b,c,d,e,f,g,h,i) \ 1016 RND_STEP_7(a,b,c,d,e,f,g,h,i) \ 1017 RND_STEP_8(a,b,c,d,e,f,g,h,i) 1018 1019 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 1020 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) 1021 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) 1022 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) 1023 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) 1024 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) 1025 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) 1026 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) 1027 1028 1029 #define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 1030 #define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) 1031 #define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) 1032 #define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) 1033 #define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) 1034 #define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) 1035 #define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) 1036 #define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) 1037 1038 #define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 1039 #define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) 1040 #define 
RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) 1041 #define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) 1042 #define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) 1043 #define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) 1044 #define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) 1045 #define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) 1046 1047 #define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 1048 #define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i) 1049 #define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i) 1050 #define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i) 1051 #define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i) 1052 #define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i) 1053 #define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i) 1054 #define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i) 1055 1056 #define FOR(cnt, init, max, inc, loop) \ 1057 __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):) 1058 #define END(cnt, init, max, inc, loop) \ 1059 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::); 1408 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 1409 "xorl %" #a ", " L2 "\n\t" \ 1410 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 1411 "addl " L3 ", %" #h "\n\t" \ 1412 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 1413 "rorl $2, " L2 "\n\t" \ 1414 /* L1 = d (e of next RND) */ \ 1415 "movl %" #d ", " L1 "\n\t" \ 1416 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 1417 "addl " L2 ", %" #h "\n\t" \ 1418 1419 1420 #define RND_ALL_0(a, b, c, d, e, f, g, h, i) \ 1421 _RND_ALL_0(a, b, c, d, e, f, g, h, i) 1422 #define RND_ALL_1(a, b, c, d, e, f, g, h, i) \ 1423 _RND_ALL_1(a, b, c, d, e, f, g, h, i) 1424 1425 #define RND_ALL_4(a, b, c, d, e, f, g, h, i) \ 1426 RND_ALL_0(a, b, c, d, e, f, g, h, i+0) \ 1427 RND_ALL_1(h, a, b, c, d, e, f, g, i+1) \ 1428 RND_ALL_0(g, h, a, b, c, d, e, f, i+2) \ 1429 RND_ALL_1(f, g, h, a, b, c, d, e, i+3) 1060 1430 1061 1431 #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ … … 1063 1433 #if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ 1064 1434 1065 #define VPALIGNR(op1,op2,op3,op4)\1435 #define _VPALIGNR(op1, op2, op3, op4) \ 1066 1436 "vpalignr $"#op4", %"#op3", %"#op2", %"#op1"\n\t" 1067 #define VPADDD(op1,op2,op3) \ 1437 #define VPALIGNR(op1, op2, op3, op4) \ 1438 _VPALIGNR(op1, op2, op3, op4) 1439 #define _VPADDD(op1, op2, op3) \ 1068 1440 "vpaddd %"#op3", %"#op2", %"#op1"\n\t" 1069 #define VPSRLD(op1,op2,op3) \ 1441 #define VPADDD(op1, op2, op3) \ 1442 _VPADDD(op1, op2, op3) 1443 #define _VPSRLD(op1, op2, op3) \ 1070 1444 "vpsrld $"#op3", %"#op2", %"#op1"\n\t" 1071 #define VPSRLQ(op1,op2,op3) \ 1445 #define VPSRLD(op1, op2, op3) \ 1446 _VPSRLD(op1, op2, op3) 1447 #define _VPSRLQ(op1, op2, op3) \ 1072 1448 "vpsrlq $"#op3", %"#op2", %"#op1"\n\t" 1073 #define VPSLLD(op1,op2,op3) \ 1449 #define VPSRLQ(op1,op2,op3) \ 1450 _VPSRLQ(op1,op2,op3) 1451 #define 
_VPSLLD(op1,op2,op3) \ 1074 1452 "vpslld $"#op3", %"#op2", %"#op1"\n\t" 1075 #define VPOR(op1,op2,op3) \ 1453 #define VPSLLD(op1,op2,op3) \ 1454 _VPSLLD(op1,op2,op3) 1455 #define _VPOR(op1,op2,op3) \ 1076 1456 "vpor %"#op3", %"#op2", %"#op1"\n\t" 1077 #define VPXOR(op1,op2,op3) \ 1457 #define VPOR(op1,op2,op3) \ 1458 _VPOR(op1,op2,op3) 1459 #define _VPXOR(op1,op2,op3) \ 1078 1460 "vpxor %"#op3", %"#op2", %"#op1"\n\t" 1079 #define VPSHUFD(op1,op2,op3) \ 1461 #define VPXOR(op1,op2,op3) \ 1462 _VPXOR(op1,op2,op3) 1463 #define _VPSHUFD(op1,op2,op3) \ 1080 1464 "vpshufd $"#op3", %"#op2", %"#op1"\n\t" 1081 #define VPSHUFB(op1,op2,op3) \ 1465 #define VPSHUFD(op1,op2,op3) \ 1466 _VPSHUFD(op1,op2,op3) 1467 #define _VPSHUFB(op1,op2,op3) \ 1082 1468 "vpshufb %"#op3", %"#op2", %"#op1"\n\t" 1083 1084 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\ 1085 a,b,c,d,e,f,g,h,_i)\ 1086 RND_STEP_1(a,b,c,d,e,f,g,h,_i)\ 1469 #define VPSHUFB(op1,op2,op3) \ 1470 _VPSHUFB(op1,op2,op3) 1471 #define _VPSLLDQ(op1,op2,op3) \ 1472 "vpslldq $" #op3", %" #op2", %" #op1"\n\t" 1473 #define VPSLLDQ(op1,op2,op3) \ 1474 _VPSLLDQ(op1,op2,op3) 1475 1476 #define MsgSched(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \ 1477 RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \ 1478 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\ 1479 VPALIGNR (XTMP0, X3, X2, 4) /* XTMP0 = W[-7] */ \ 1480 RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \ 1481 RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \ 1482 VPSRLD (XTMP2, XTMP1, 7) /* XTMP2 = W[-15] >> 7 */ \ 1483 VPSLLD (XTMP3, XTMP1, 25) /* XTEMP3 = W[-15] << (32-7) */ \ 1484 RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \ 1485 RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \ 1486 VPSRLD (XTMP4, XTMP1, 18) /* XTEMP4 = W[-15] >> 18 */ \ 1487 VPSLLD (XTMP5, XTMP1, 14) /* XTEMP5 = W[-15] << (32-18) */ \ 1488 RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \ 1489 RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \ 1490 VPOR (XTMP2, XTMP3, XTMP2) /* XTMP2 = W[-15] >>> 7 */ \ 1491 VPOR (XTMP4, XTMP5, XTMP4) /* XTMP4 = W[-15] >>> 18 */ \ 1492 RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \ 1493 RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \ 1494 RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \ 1495 VPSRLD (XTMP5, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */ \ 1496 VPXOR (XTMP2, XTMP4, XTMP2) \ 1497 /* XTMP2 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \ 1498 RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \ 1499 RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \ 1500 VPXOR (XTMP1, XTMP5, XTMP2) /* XTMP1 = s0 */ \ 1501 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\ 1502 RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \ 1503 RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \ 1504 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\ 1505 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 1506 RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \ 1507 RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \ 1508 RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \ 1509 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ 1510 VPADDD (XTMP0, XTMP0, X0) \ 1511 RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \ 1512 RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \ 1513 RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \ 1514 VPXOR (XTMP2, XTMP3, XTMP2) \ 1515 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \ 1516 RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \ 1517 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\ 1518 RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \ 1519 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\ 1520 RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \ 1521 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\ 1522 RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \ 1523 
RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \ 1524 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\ 1525 RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \ 1526 VPSRLQ (XTMP4, XTMP2, 17) /* XTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ 1527 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 1528 RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \ 1529 RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \ 1530 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */ \ 1531 VPXOR (XTMP4, XTMP3, XTMP4) \ 1532 RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \ 1533 RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \ 1534 VPXOR (XTMP5, XTMP4, XTMP5) /* XTMP5 = s1 {xDxC} */ \ 1535 RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \ 1536 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\ 1537 RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \ 1538 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */ 1539 1540 #if defined(HAVE_INTEL_RORX) 1541 1542 #define MsgSched_RORX(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \ 1543 RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \ 1087 1544 VPALIGNR (XTMP0, X3, X2, 4)\ 1088 RND_STEP_2(a,b,c,d,e,f,g,h,_i)\1089 VPADDD (XTMP0, XTMP0, X0)\1090 RND_STEP_3(a,b,c,d,e,f,g,h,_i)\1091 1545 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\ 1092 RND_STEP_4(a,b,c,d,e,f,g,h,_i)\ 1546 RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \ 1547 RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \ 1093 1548 VPSRLD (XTMP2, XTMP1, 7)\ 1094 RND_STEP_5(a,b,c,d,e,f,g,h,_i)\1095 1549 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ 1096 RND_STEP_6(a,b,c,d,e,f,g,h,_i)\ 1550 RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \ 1551 RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \ 1552 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */ \ 1097 1553 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\ 1098 RND_STEP_ 7(a,b,c,d,e,f,g,h,_i)\1099 VPSRLD (XTMP2, XTMP1,18)\1100 RND_STEP_ 8(a,b,c,d,e,f,g,h,_i)\1554 RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \ 1555 RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \ 1556 RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \ 1101 1557 \ 1102 RND_STEP_ 1(h,a,b,c,d,e,f,g,_i+1)\1103 VPSRLD (XTMP 4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\1104 RND_STEP_ 2(h,a,b,c,d,e,f,g,_i+1)\1558 RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \ 1559 VPSRLD (XTMP2, XTMP1,18) \ 1560 RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \ 1105 1561 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ 1106 RND_STEP_ 3(h,a,b,c,d,e,f,g,_i+1)\1562 RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \ 1107 1563 VPXOR (XTMP3, XTMP3, XTMP1)\ 1108 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1)\ 1109 VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ 1110 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1)\ 1111 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\ 1112 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1)\ 1564 RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \ 1565 VPXOR (XTMP3, XTMP3, XTMP2) \ 1566 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \ 1567 RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \ 1113 1568 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\ 1114 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1)\ 1115 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\ 1116 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1)\ 1569 RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \ 1570 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */ \ 1571 RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \ 1572 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\ 1573 RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \ 1574 \ 1575 RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \ 1576 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 1577 RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \ 1578 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 
{xBxA} */\ 1579 VPADDD (XTMP0, XTMP0, X0) \ 1580 RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \ 1581 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \ 1582 RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \ 1583 VPXOR (XTMP2, XTMP2, XTMP3)\ 1584 RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \ 1585 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\ 1586 RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \ 1587 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\ 1588 RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \ 1589 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\ 1590 RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \ 1117 1591 \ 1118 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2)\ 1119 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\ 1120 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2)\ 1121 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 1122 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2)\ 1123 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ 1124 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2)\ 1592 RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \ 1593 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\ 1594 RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \ 1595 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\ 1596 RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \ 1597 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 1598 RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \ 1599 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ 1600 RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \ 1125 1601 VPXOR (XTMP2, XTMP2, XTMP3)\ 1126 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2)\ 1127 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\ 1128 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2)\ 1129 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\ 1130 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2)\ 1131 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\ 1132 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2)\ 1133 \ 1134 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3)\ 1135 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\ 1136 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3)\ 1137 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\ 1138 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3)\ 1139 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 1140 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3)\ 1141 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ 1142 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3)\ 1143 VPXOR (XTMP2, XTMP2, XTMP3)\ 1144 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3)\ 1602 RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \ 1145 1603 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\ 1146 RND_STEP_ 7(f,g,h,a,b,c,d,e,_i+3)\1604 RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \ 1147 1605 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\ 1148 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3)\ 1149 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\ 1150 1151 #if defined(HAVE_INTEL_RORX) 1152 1153 #define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \ 1154 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\ 1155 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i)\ 1156 VPALIGNR (XTMP0, X3, X2, 4)\ 1157 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i)\ 1158 VPADDD (XTMP0, XTMP0, X0)\ 1159 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i)\ 1160 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\ 1161 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i)\ 1162 VPSRLD (XTMP2, XTMP1, 7)\ 1163 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i)\ 1164 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ 1165 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i)\ 1166 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\ 1167 
RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i)\ 1168 VPSRLD (XTMP2, XTMP1,18)\ 1169 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i)\ 1170 \ 1171 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1)\ 1172 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\ 1173 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1)\ 1174 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ 1175 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1)\ 1176 VPXOR (XTMP3, XTMP3, XTMP1)\ 1177 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1)\ 1178 VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ 1179 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1)\ 1180 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\ 1181 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1)\ 1182 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\ 1183 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1)\ 1184 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\ 1185 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1)\ 1186 \ 1187 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2)\ 1188 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\ 1189 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2)\ 1190 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 1191 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2)\ 1192 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ 1193 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2)\ 1194 VPXOR (XTMP2, XTMP2, XTMP3)\ 1195 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2)\ 1196 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\ 1197 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2)\ 1198 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\ 1199 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2)\ 1200 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\ 1201 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2)\ 1202 \ 1203 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3)\ 1204 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\ 1205 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3)\ 1206 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\ 1207 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3)\ 1208 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 1209 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3)\ 1210 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ 1211 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3)\ 1212 VPXOR (XTMP2, XTMP2, XTMP3)\ 1213 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3)\ 1214 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\ 1215 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3)\ 1216 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\ 1217 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3)\ 1218 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\ 1606 RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \ 1607 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */ 1219 1608 1220 1609 #endif /* HAVE_INTEL_RORX */ 1221 1610 1222 1611 1223 #define W_K_from_buff() \ 1224 "leaq %[buf], %%r8\n\t" \ 1225 "vmovdqu (%%r8), %%xmm4\n\t" \ 1226 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" \ 1227 "vmovdqu 16(%%r8), %%xmm5\n\t" \ 1228 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" \ 1229 "vmovdqu 32(%%r8), %%xmm6\n\t" \ 1230 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" \ 1231 "vmovdqu 48(%%r8), %%xmm7\n\t" \ 1232 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t" 1233 1234 #define _SET_W_K_XFER(reg, i) \ 1235 "leaq %[K], %%r8\n\t" \ 1236 "vpaddd ("#i")*4(%%r8), %"#reg", %%xmm9\n\t" \ 1237 "leaq %[W_K], %%r8\n\t" \ 1238 "vmovdqa %%xmm9, ("#i")*4(%%r8)\n\t" 1239 1240 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i) 1241 1242 static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ 1243 static const ALIGN32 word64 mSHUF_DC00[] = { 
0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ 1244 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b }; 1245 1612 #define _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \ 1613 "# X0, X1, X2, X3 = W[0..15]\n\t" \ 1614 "vmovdqu (%%rax), %" #X0 "\n\t" \ 1615 "vmovdqu 16(%%rax), %" #X1 "\n\t" \ 1616 VPSHUFB(X0, X0, BYTE_FLIP_MASK) \ 1617 VPSHUFB(X1, X1, BYTE_FLIP_MASK) \ 1618 "vmovdqu 32(%%rax), %" #X2 "\n\t" \ 1619 "vmovdqu 48(%%rax), %" #X3 "\n\t" \ 1620 VPSHUFB(X2, X2, BYTE_FLIP_MASK) \ 1621 VPSHUFB(X3, X3, BYTE_FLIP_MASK) 1622 1623 #define W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \ 1624 _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 1625 1626 1627 #define _SET_W_K_XFER_4(i) \ 1628 "vpaddd (" #i "*4)+ 0+%[K], %%xmm0, %%xmm4\n\t" \ 1629 "vpaddd (" #i "*4)+16+%[K], %%xmm1, %%xmm5\n\t" \ 1630 "vmovdqu %%xmm4, (" WK ")\n\t" \ 1631 "vmovdqu %%xmm5, 16(" WK ")\n\t" \ 1632 "vpaddd (" #i "*4)+32+%[K], %%xmm2, %%xmm6\n\t" \ 1633 "vpaddd (" #i "*4)+48+%[K], %%xmm3, %%xmm7\n\t" \ 1634 "vmovdqu %%xmm6, 32(" WK ")\n\t" \ 1635 "vmovdqu %%xmm7, 48(" WK ")\n\t" 1636 1637 #define SET_W_K_XFER_4(i) \ 1638 _SET_W_K_XFER_4(i) 1639 1640 1641 static const ALIGN32 word64 mSHUF_00BA[] = 1642 { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ 1643 static const ALIGN32 word64 mSHUF_DC00[] = 1644 { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ 1645 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = 1646 { 0x0405060700010203, 0x0c0d0e0f08090a0b }; 1246 1647 1247 1648 #define _Init_Masks(mask1, mask2, mask3) \ 1248 "vmovdq u %[FLIP], %"#mask1"\n\t" \1249 "vmovdq u %[SHUF00BA], %"#mask2"\n\t" \1250 "vmovdq u %[SHUFDC00], %"#mask3"\n\t"1649 "vmovdqa %[FLIP], %" #mask1 "\n\t" \ 1650 "vmovdqa %[SHUF00BA], %" #mask2 "\n\t" \ 1651 "vmovdqa %[SHUFDC00], %" #mask3 "\n\t" 1251 1652 1252 1653 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\ 1253 1654 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 1254 1655 1255 #define X0 %xmm4 1256 #define X1 %xmm5 1257 #define X2 %xmm6 1258 #define X3 %xmm7 1259 #define X_ X0 1260 1261 #define XTMP0 %xmm0 1262 #define XTMP1 %xmm1 1263 #define XTMP2 %xmm2 1264 #define XTMP3 %xmm3 1656 #define X0 %xmm0 1657 #define X1 %xmm1 1658 #define X2 %xmm2 1659 #define X3 %xmm3 1660 1661 #define XTMP0 %xmm4 1662 #define XTMP1 %xmm5 1663 #define XTMP2 %xmm6 1664 #define XTMP3 %xmm7 1265 1665 #define XTMP4 %xmm8 1266 1666 #define XTMP5 %xmm9 … … 1272 1672 1273 1673 1274 static int Transform_AVX1(wc_Sha256* sha256)1674 SHA256_NOINLINE static int Transform_Sha256_AVX1(wc_Sha256* sha256) 1275 1675 { 1276 ALIGN32 word32 W_K[64]; /* temp for W+K */1277 1278 1676 __asm__ __volatile__ ( 1279 1677 1678 "subq $64, %%rsp\n\t" 1679 1680 "leaq 32(%[sha256]), %%rax\n\t" 1280 1681 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 1281 "# X0, X1, X2, X3 = W[0..15]; \n\t" 1282 W_K_from_buff() 1283 1284 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) 1285 1286 SET_W_K_XFER(X0, 0) 1287 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1288 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) 1289 SET_W_K_XFER(X1, 4) 1290 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1291 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) 1292 SET_W_K_XFER(X2, 8) 1293 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1294 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) 1295 SET_W_K_XFER(X3, 12) 1296 MessageSched(X3, X0, X1, X2, 
XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1297 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) 1298 SET_W_K_XFER(X0, 16) 1299 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1300 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) 1301 SET_W_K_XFER(X1, 20) 1302 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1303 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) 1304 SET_W_K_XFER(X2, 24) 1305 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1306 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) 1307 SET_W_K_XFER(X3, 28) 1308 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1309 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) 1310 SET_W_K_XFER(X0, 32) 1311 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1312 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) 1313 SET_W_K_XFER(X1, 36) 1314 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1315 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) 1316 SET_W_K_XFER(X2, 40) 1317 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1318 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) 1319 SET_W_K_XFER(X3, 44) 1320 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1321 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) 1322 1323 SET_W_K_XFER(X0, 48) 1324 SET_W_K_XFER(X1, 52) 1325 SET_W_K_XFER(X2, 56) 1326 SET_W_K_XFER(X3, 60) 1327 1328 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) 1329 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) 1330 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) 1331 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) 1332 1333 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) 1334 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) 1335 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) 1336 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) 1337 1338 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) 1339 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) 1340 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) 1341 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) 1342 1343 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) 1344 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) 1345 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) 1346 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) 1347 1348 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) 1682 LOAD_DIGEST() 1683 1684 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 1685 1686 "movl %%r9d, " L4 "\n\t" 1687 "movl %%r12d, " L1 "\n\t" 1688 "xorl %%r10d, " L4 "\n\t" 1689 1690 SET_W_K_XFER_4(0) 1691 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1692 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1693 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1694 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1695 1696 SET_W_K_XFER_4(16) 1697 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1698 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1699 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1700 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1701 1702 SET_W_K_XFER_4(32) 1703 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1704 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1705 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1706 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 
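For reference, the interleaved MsgSched()/SET_W_K_XFER_4() groups in the new Transform_Sha256_AVX1 compute the SHA-256 message schedule four words at a time and stage W[i]+K[i] on the stack work area so each round only adds a single value. A minimal, reference-only C sketch of the scalar recurrence they vectorize (not part of the changeset; ROTR is a hypothetical 32-bit rotate-right helper):

    #include <stdint.h>
    #define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* Scalar equivalent of MsgSched (schedule expansion) plus
     * SET_W_K_XFER_4 (store W[i]+K[i] for the round code to consume). */
    static void msg_sched_ref(uint32_t W[64], uint32_t WK[64], const uint32_t K[64])
    {
        int i;
        for (i = 16; i < 64; i++) {
            uint32_t s0 = ROTR(W[i-15],  7) ^ ROTR(W[i-15], 18) ^ (W[i-15] >>  3);
            uint32_t s1 = ROTR(W[i- 2], 17) ^ ROTR(W[i- 2], 19) ^ (W[i- 2] >> 10);
            W[i] = W[i-16] + s0 + W[i-7] + s1;   /* built four-at-a-time in XMM regs */
        }
        for (i = 0; i < 64; i++)
            WK[i] = W[i] + K[i];                 /* staged to the stack by SET_W_K_XFER_4 */
    }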
1707 1708 SET_W_K_XFER_4(48) 1709 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1710 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1711 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1712 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1713 1714 STORE_ADD_DIGEST() 1715 1716 "addq $64, %%rsp\n\t" 1349 1717 1350 1718 : … … 1352 1720 [SHUF00BA] "m" (mSHUF_00BA[0]), 1353 1721 [SHUFDC00] "m" (mSHUF_DC00[0]), 1354 [digest] "m" (sha256->digest), 1355 [buf] "m" (sha256->buffer), 1356 [K] "m" (K), 1357 [W_K] "m" (W_K) 1358 : SSE_REGs, "memory" 1722 [sha256] "r" (sha256), 1723 [K] "m" (K) 1724 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 1359 1725 ); 1360 1726 … … 1362 1728 } 1363 1729 1364 #if defined(HAVE_INTEL_RORX) 1365 static int Transform_AVX1_RORX(wc_Sha256* sha256)1730 SHA256_NOINLINE static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, 1731 word32 len) 1366 1732 { 1367 ALIGN32 word32 W_K[64]; /* temp for W+K */1368 1369 1733 __asm__ __volatile__ ( 1370 1734 1735 "subq $64, %%rsp\n\t" 1736 "movq 120(%[sha256]), %%rax\n\t" 1737 1371 1738 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 1372 "# X0, X1, X2, X3 = W[0..15]; \n\t" 1373 W_K_from_buff() 1374 1375 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) 1376 1377 SET_W_K_XFER(X0, 0) 1378 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 1379 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) 1380 SET_W_K_XFER(X1, 4) 1381 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1382 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) 1383 SET_W_K_XFER(X2, 8) 1384 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1385 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) 1386 SET_W_K_XFER(X3, 12) 1387 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1388 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) 1389 SET_W_K_XFER(X0, 16) 1390 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1391 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) 1392 SET_W_K_XFER(X1, 20) 1393 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1394 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) 1395 SET_W_K_XFER(X2, 24) 1396 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1397 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) 1398 SET_W_K_XFER(X3, 28) 1399 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1400 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) 1401 SET_W_K_XFER(X0, 32) 1402 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1403 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) 1404 SET_W_K_XFER(X1, 36) 1405 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1406 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) 1407 SET_W_K_XFER(X2, 40) 1408 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1409 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) 1410 SET_W_K_XFER(X3, 44) 1411 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 1412 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) 1413 1414 SET_W_K_XFER(X0, 48) 1415 SET_W_K_XFER(X1, 52) 1416 SET_W_K_XFER(X2, 56) 1417 SET_W_K_XFER(X3, 60) 1418 1419 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) 1420 
RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) 1421 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) 1422 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) 1423 1424 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) 1425 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) 1426 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) 1427 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) 1428 1429 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) 1430 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) 1431 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) 1432 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) 1433 1434 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) 1435 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) 1436 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) 1437 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) 1438 1439 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) 1739 LOAD_DIGEST() 1740 1741 "# Start of loop processing a block\n" 1742 "1:\n\t" 1743 1744 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 1745 1746 "movl %%r9d, " L4 "\n\t" 1747 "movl %%r12d, " L1 "\n\t" 1748 "xorl %%r10d, " L4 "\n\t" 1749 1750 SET_W_K_XFER_4(0) 1751 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1752 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1753 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1754 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1755 1756 SET_W_K_XFER_4(16) 1757 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1758 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1759 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1760 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1761 1762 SET_W_K_XFER_4(32) 1763 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1764 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1765 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1766 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1767 1768 SET_W_K_XFER_4(48) 1769 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1770 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1771 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1772 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1773 "movq 120(%[sha256]), %%rax\n\t" 1774 1775 ADD_DIGEST() 1776 1777 "addq $64, %%rax\n\t" 1778 "subl $64, %[len]\n\t" 1779 1780 STORE_DIGEST() 1781 1782 "movq %%rax, 120(%[sha256])\n\t" 1783 "jnz 1b\n\t" 1784 1785 "addq $64, %%rsp\n\t" 1440 1786 1441 1787 : … … 1443 1789 [SHUF00BA] "m" (mSHUF_00BA[0]), 1444 1790 [SHUFDC00] "m" (mSHUF_DC00[0]), 1445 [digest] "m" (sha256->digest), 1446 [buf] "m" (sha256->buffer), 1447 [K] "m" (K), 1448 [W_K] "m" (W_K) 1449 : SSE_REGs, "memory" 1791 [sha256] "r" (sha256), 1792 [len] "r" (len), 1793 [K] "m" (K) 1794 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 1450 1795 ); 1451 1796 1452 1797 return 0; 1453 1798 } 1454 #endif /* HAVE_INTEL_RORX */1455 1799 #endif /* HAVE_INTEL_AVX1 */ 1456 1800 1457 1458 #if defined(HAVE_INTEL_AVX2) 1459 1460 #define _MOVE_to_REG(ymm, mem, i) \ 1461 "leaq %["#mem"], %%r8\n\t" \ 1462 "vmovdqu ("#i")*4(%%r8), %%"#ymm"\n\t" 1463 #define _MOVE_to_MEM(mem, i, ymm) \ 1464 "leaq %["#mem"], %%r8\n\t" \ 1465 "vmovdqu %%"#ymm", "#i"*4(%%r8)\n\t" 1466 #define _BYTE_SWAP(ymm, map) \ 1467 "vpshufb %["#map"], %%"#ymm", %%"#ymm"\n\t" 1468 #define _MOVE_128(ymm0, ymm1, ymm2, map) \ 1469 "vperm2i128 $"#map", %%"#ymm2", %%"#ymm1", %%"#ymm0"\n\t" 1470 #define _MOVE_BYTE(ymm0, ymm1, map) \ 1471 "vpshufb %["#map"], %%"#ymm1", %%"#ymm0"\n\t" 1472 #define 
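The new *_Len variants shown here differ from the one-shot transform in how they are driven: the digest stays in registers, and the loop labelled "1:" hashes whole 64-byte blocks straight from the caller's buffer, reloading the block pointer kept at offset 120 of wc_Sha256 (the sha256->data field), advancing it by 64 and decrementing len until the jnz falls through. A rough C sketch of that driver pattern only, assuming a hypothetical ProcessOneBlock() helper standing in for the unrolled schedule-plus-rounds body (the real assembly does not reload the digest between blocks):

    /* Hypothetical helper: expand W from data, run 64 rounds, add into digest. */
    static void ProcessOneBlock(wc_Sha256* sha256, const byte* data);

    static void transform_len_sketch(wc_Sha256* sha256, word32 len)
    {
        while (len >= WC_SHA256_BLOCK_SIZE) {
            ProcessOneBlock(sha256, sha256->data);   /* per-block transform body */
            sha256->data += WC_SHA256_BLOCK_SIZE;    /* assembly: addq $64, %rax */
            len          -= WC_SHA256_BLOCK_SIZE;    /* assembly: subl $64, %[len] */
        }
    }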
_S_TEMP(dest, src, bits, temp) \ 1473 "vpsrld $"#bits", %%"#src", %%"#dest"\n\t" \ 1474 "vpslld $32-"#bits", %%"#src", %%"#temp"\n\t" \ 1475 "vpor %%"#temp",%%"#dest", %%"#dest"\n\t" 1476 #define _AVX2_R(dest, src, bits) \ 1477 "vpsrld $"#bits", %%"#src", %%"#dest"\n\t" 1478 #define _XOR(dest, src1, src2) \ 1479 "vpxor %%"#src1", %%"#src2", %%"#dest"\n\t" 1480 #define _OR(dest, src1, src2) \ 1481 "vpor %%"#src1", %%"#src2", %%"#dest"\n\t" 1482 #define _ADD(dest, src1, src2) \ 1483 "vpaddd %%"#src1", %%"#src2", %%"#dest"\n\t" 1484 #define _ADD_MEM(dest, src1, mem, i) \ 1485 "leaq %["#mem"], %%r8\n\t" \ 1486 "vpaddd "#i"*4(%%r8), %%"#src1", %%"#dest"\n\t" 1487 #define _BLEND(map, dest, src1, src2) \ 1488 "vpblendd $"#map", %%"#src1", %%"#src2", %%"#dest"\n\t" 1489 1490 #define _EXTRACT_XMM_0(xmm, mem) \ 1491 "vpextrd $0, %%"#xmm", %["#mem"]\n\t" 1492 #define _EXTRACT_XMM_1(xmm, mem) \ 1493 "vpextrd $1, %%"#xmm", %["#mem"]\n\t" 1494 #define _EXTRACT_XMM_2(xmm, mem) \ 1495 "vpextrd $2, %%"#xmm", %["#mem"]\n\t" 1496 #define _EXTRACT_XMM_3(xmm, mem) \ 1497 "vpextrd $3, %%"#xmm", %["#mem"]\n\t" 1498 #define _EXTRACT_XMM_4(ymm, xmm, mem) \ 1499 "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t" \ 1500 "vpextrd $0, %%"#xmm", %["#mem"]\n\t" 1501 #define _EXTRACT_XMM_5(xmm, mem) \ 1502 "vpextrd $1, %%"#xmm", %["#mem"]\n\t" 1503 #define _EXTRACT_XMM_6(xmm, mem) \ 1504 "vpextrd $2, %%"#xmm", %["#mem"]\n\t" 1505 #define _EXTRACT_XMM_7(xmm, mem) \ 1506 "vpextrd $3, %%"#xmm", %["#mem"]\n\t" 1507 1508 #define _SWAP_YMM_HL(ymm) \ 1509 "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t" 1510 #define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm) 1511 1512 #define MOVE_to_REG(ymm, mem, i) _MOVE_to_REG(ymm, mem, i) 1513 #define MOVE_to_MEM(mem, i, ymm) _MOVE_to_MEM(mem, i, ymm) 1514 #define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map) 1515 #define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map) 1516 #define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map) 1517 #define XOR(dest, src1, src2) _XOR(dest, src1, src2) 1518 #define OR(dest, src1, src2) _OR(dest, src1, src2) 1519 #define ADD(dest, src1, src2) _ADD(dest, src1, src2) 1520 #define ADD_MEM(dest, src1, mem, i) _ADD_MEM(dest, src1, mem, i) 1521 #define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2) 1522 1523 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp) 1524 #define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) 1525 #define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) 1526 1527 #define GAMMA0(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18) \ 1528 XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) XOR(dest, G_TEMP, dest) 1529 #define GAMMA0_1(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18) 1530 #define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) \ 1531 XOR(dest, G_TEMP, dest) 1532 1533 #define GAMMA1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19) \ 1534 XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) XOR(dest, G_TEMP, dest) 1535 #define GAMMA1_1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19) 1536 #define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) \ 1537 XOR(dest, G_TEMP, dest) 1538 1539 #define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP1W_2) \ 1540 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) 1541 #define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) \ 1542 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAP2W_2) BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) 1543 #define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP3W_2) \ 1544 BLEND(0xc0, 
W_I_2, YMM_TEMP0, W_I_2) 1545 1546 #define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08)\ 1547 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAPW_7) BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) 1548 1549 #undef voitle 1550 1551 #define W_I_16 ymm8 1552 #define W_I_15 ymm9 1553 #define W_I_7 ymm10 1554 #define W_I_2 ymm11 1555 #define W_I ymm12 1556 #define G_TEMP ymm13 1557 #define S_TEMP ymm14 1558 #define YMM_TEMP0 ymm15 1559 #define YMM_TEMP0x xmm15 1560 #define W_I_TEMP ymm7 1561 #define W_K_TEMP ymm15 1562 #define W_K_TEMPx xmm15 1563 1564 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\ 1565 "vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15"\n\t" \ 1566 "vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16"\n\t" \ 1567 "vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15"\n\t" \ 1568 "vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16"\n\t" \ 1569 "vpshufd $0x93, %%"#w_i_16", %%"#w_i_16"\n\t" 1570 1571 #define MOVE_7_to_15(w_i_15, w_i_7)\ 1572 "vmovdqu %%"#w_i_7", %%"#w_i_15"\n\t" 1573 1574 #define MOVE_I_to_7(w_i_7, w_i)\ 1575 "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7"\n\t" \ 1576 "vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7"\n\t" \ 1577 "vpshufd $0x39, %%"#w_i_7", %%"#w_i_7"\n\t" 1578 1579 #define MOVE_I_to_2(w_i_2, w_i)\ 1580 "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2"\n\t" \ 1581 "vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2"\n\t" 1582 1583 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\ 1584 MOVE_15_to_16(w_i_16, w_i_15, w_i_7) \ 1585 MOVE_7_to_15(w_i_15, w_i_7) \ 1586 MOVE_I_to_7(w_i_7, w_i) \ 1587 MOVE_I_to_2(w_i_2, w_i) 1588 1589 #define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 1590 { word32 d[8];\ 1591 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs);\ 1592 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs);\ 1593 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs);\ 1594 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs);\ 1595 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs);\ 1596 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs);\ 1597 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs);\ 1598 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs);\ 1599 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\ 1600 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs);\ 1601 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs);\ 1602 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs);\ 1603 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs);\ 1604 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs);\ 1605 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs);\ 1606 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs);\ 1607 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs);\ 1608 } 1609 1610 1611 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 1612 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 1613 1614 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 1615 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 1616 1617 #define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 1618 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 1619 1620 1621 /* Byte swap Masks to ensure that rest of the words are filled with zero's. 
*/ 1622 static const unsigned long mBYTE_FLIP_MASK_16[] = 1623 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b }; 1624 static const unsigned long mBYTE_FLIP_MASK_15[] = 1625 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b }; 1626 static const unsigned long mBYTE_FLIP_MASK_7 [] = 1627 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b }; 1628 static const unsigned long mBYTE_FLIP_MASK_2 [] = 1629 { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 }; 1630 1631 static const unsigned long mMAPtoW_I_7[] = 1632 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 }; 1633 static const unsigned long mMAP1toW_I_2[] = 1634 { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 }; 1635 static const unsigned long mMAP2toW_I_2[] = 1636 { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 }; 1637 static const unsigned long mMAP3toW_I_2[] = 1638 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 }; 1639 1640 static int Transform_AVX2(wc_Sha256* sha256) 1801 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) 1802 SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX(wc_Sha256* sha256) 1641 1803 { 1642 #ifdef WOLFSSL_SMALL_STACK1643 word32* W_K;1644 W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);1645 if (W_K == NULL)1646 return MEMORY_E;1647 #else1648 word32 W_K[64];1649 #endif1650 1651 1804 __asm__ __volatile__ ( 1652 1805 1653 MOVE_to_REG(W_I_16, buf, 0) BYTE_SWAP(W_I_16, FLIP_16) 1654 MOVE_to_REG(W_I_15, buf, 1) BYTE_SWAP(W_I_15, FLIP_15) 1655 MOVE_to_REG(W_I, buf, 8) BYTE_SWAP(W_I, FLIP_16) 1656 MOVE_to_REG(W_I_7, buf, 16-7) BYTE_SWAP(W_I_7, FLIP_7) 1657 MOVE_to_REG(W_I_2, buf, 16-2) BYTE_SWAP(W_I_2, FLIP_2) 1658 1659 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) 1660 1661 ADD_MEM(W_K_TEMP, W_I_16, K, 0) 1662 MOVE_to_MEM(W_K, 0, W_K_TEMP) 1663 1664 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) 1665 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) 1666 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) 1667 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) 1668 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) 1669 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) 1670 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) 1671 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) 1672 1673 ADD_MEM(YMM_TEMP0, W_I, K, 8) 1674 MOVE_to_MEM(W_K, 8, YMM_TEMP0) 1675 1676 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ 1677 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) 1678 GAMMA0_1(W_I_TEMP, W_I_15) 1679 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) 1680 GAMMA0_2(W_I_TEMP, W_I_15) 1681 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) 1682 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ 1683 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) 1684 ADD(W_I, W_I_7, W_I_TEMP) 1685 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) 1686 GAMMA1_1(YMM_TEMP0, W_I_2) 1687 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) 1688 GAMMA1_2(YMM_TEMP0, W_I_2) 1689 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) 1690 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ 1691 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) 1692 FEEDBACK1_to_W_I_2 1693 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) 1694 FEEDBACK_to_W_I_7 1695 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) 1696 ADD(W_I_TEMP, W_I_7, W_I_TEMP) 1697 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) 1698 GAMMA1_1(YMM_TEMP0, W_I_2) 1699 
RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) 1700 GAMMA1_2(YMM_TEMP0, W_I_2) 1701 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) 1702 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ 1703 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) 1704 FEEDBACK2_to_W_I_2 1705 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) 1706 GAMMA1_1(YMM_TEMP0, W_I_2) 1707 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) 1708 GAMMA1_2(YMM_TEMP0, W_I_2) 1709 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) 1710 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ 1711 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) 1712 FEEDBACK3_to_W_I_2 1713 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) 1714 GAMMA1(YMM_TEMP0, W_I_2) 1715 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) 1716 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) 1717 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ 1718 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) 1719 1720 MOVE_to_REG(YMM_TEMP0, K, 16) 1721 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) 1722 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) 1723 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) 1724 ADD(YMM_TEMP0, YMM_TEMP0, W_I) 1725 MOVE_to_MEM(W_K, 16, YMM_TEMP0) 1726 1727 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ 1728 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) 1729 GAMMA0_1(W_I_TEMP, W_I_15) 1730 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) 1731 GAMMA0_2(W_I_TEMP, W_I_15) 1732 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) 1733 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ 1734 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) 1735 ADD(W_I, W_I_7, W_I_TEMP) 1736 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) 1737 GAMMA1_1(YMM_TEMP0, W_I_2) 1738 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) 1739 GAMMA1_2(YMM_TEMP0, W_I_2) 1740 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) 1741 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ 1742 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) 1743 FEEDBACK1_to_W_I_2 1744 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) 1745 FEEDBACK_to_W_I_7 1746 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) 1747 ADD(W_I_TEMP, W_I_7, W_I_TEMP) 1748 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) 1749 GAMMA1(YMM_TEMP0, W_I_2) 1750 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) 1751 GAMMA1_2(YMM_TEMP0, W_I_2) 1752 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) 1753 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ 1754 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) 1755 FEEDBACK2_to_W_I_2 1756 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) 1757 GAMMA1_1(YMM_TEMP0, W_I_2) 1758 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) 1759 GAMMA1_2(YMM_TEMP0, W_I_2) 1760 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) 1761 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ 1762 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) 1763 FEEDBACK3_to_W_I_2 1764 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) 1765 GAMMA1_1(YMM_TEMP0, W_I_2) 1766 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) 1767 GAMMA1_2(YMM_TEMP0, W_I_2) 1768 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) 1769 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ 1770 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) 1771 1772 MOVE_to_REG(YMM_TEMP0, K, 24) 1773 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) 1774 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) 1775 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) 1776 ADD(YMM_TEMP0, YMM_TEMP0, W_I) 1777 MOVE_to_MEM(W_K, 24, YMM_TEMP0) 1778 1779 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ 1780 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) 1781 
GAMMA0_1(W_I_TEMP, W_I_15) 1782 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) 1783 GAMMA0_2(W_I_TEMP, W_I_15) 1784 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) 1785 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ 1786 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) 1787 ADD(W_I, W_I_7, W_I_TEMP) 1788 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) 1789 GAMMA1_1(YMM_TEMP0, W_I_2) 1790 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) 1791 GAMMA1_2(YMM_TEMP0, W_I_2) 1792 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) 1793 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ 1794 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) 1795 FEEDBACK1_to_W_I_2 1796 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) 1797 FEEDBACK_to_W_I_7 1798 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) 1799 ADD(W_I_TEMP, W_I_7, W_I_TEMP) 1800 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) 1801 GAMMA1_1(YMM_TEMP0, W_I_2) 1802 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) 1803 GAMMA1_2(YMM_TEMP0, W_I_2) 1804 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) 1805 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ 1806 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) 1807 FEEDBACK2_to_W_I_2 1808 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) 1809 GAMMA1_1(YMM_TEMP0, W_I_2) 1810 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) 1811 GAMMA1_2(YMM_TEMP0, W_I_2) 1812 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) 1813 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ 1814 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) 1815 FEEDBACK3_to_W_I_2 1816 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) 1817 GAMMA1(YMM_TEMP0, W_I_2) 1818 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) 1819 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) 1820 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ 1821 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) 1822 1823 MOVE_to_REG(YMM_TEMP0, K, 32) 1824 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) 1825 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) 1826 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) 1827 ADD(YMM_TEMP0, YMM_TEMP0, W_I) 1828 MOVE_to_MEM(W_K, 32, YMM_TEMP0) 1829 1830 1831 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ 1832 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) 1833 GAMMA0_1(W_I_TEMP, W_I_15) 1834 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) 1835 GAMMA0_2(W_I_TEMP, W_I_15) 1836 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) 1837 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ 1838 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) 1839 ADD(W_I, W_I_7, W_I_TEMP) 1840 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) 1841 GAMMA1_1(YMM_TEMP0, W_I_2) 1842 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) 1843 GAMMA1_2(YMM_TEMP0, W_I_2) 1844 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) 1845 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ 1846 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) 1847 FEEDBACK1_to_W_I_2 1848 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) 1849 FEEDBACK_to_W_I_7 1850 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) 1851 ADD(W_I_TEMP, W_I_7, W_I_TEMP) 1852 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) 1853 GAMMA1_1(YMM_TEMP0, W_I_2) 1854 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) 1855 GAMMA1_2(YMM_TEMP0, W_I_2) 1856 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) 1857 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ 1858 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) 1859 FEEDBACK2_to_W_I_2 1860 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) 1861 GAMMA1_1(YMM_TEMP0, W_I_2) 1862 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) 1863 
GAMMA1_2(YMM_TEMP0, W_I_2) 1864 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) 1865 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ 1866 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) 1867 FEEDBACK3_to_W_I_2 1868 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) 1869 GAMMA1_1(YMM_TEMP0, W_I_2) 1870 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) 1871 GAMMA1_2(YMM_TEMP0, W_I_2) 1872 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) 1873 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ 1874 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) 1875 1876 MOVE_to_REG(YMM_TEMP0, K, 40) 1877 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) 1878 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) 1879 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) 1880 ADD(YMM_TEMP0, YMM_TEMP0, W_I) 1881 MOVE_to_MEM(W_K, 40, YMM_TEMP0) 1882 1883 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ 1884 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) 1885 GAMMA0_1(W_I_TEMP, W_I_15) 1886 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) 1887 GAMMA0_2(W_I_TEMP, W_I_15) 1888 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) 1889 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ 1890 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) 1891 ADD(W_I, W_I_7, W_I_TEMP) 1892 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) 1893 GAMMA1_1(YMM_TEMP0, W_I_2) 1894 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) 1895 GAMMA1_2(YMM_TEMP0, W_I_2) 1896 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) 1897 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ 1898 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) 1899 FEEDBACK1_to_W_I_2 1900 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) 1901 FEEDBACK_to_W_I_7 1902 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) 1903 ADD(W_I_TEMP, W_I_7, W_I_TEMP) 1904 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) 1905 GAMMA1_1(YMM_TEMP0, W_I_2) 1906 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) 1907 GAMMA1_2(YMM_TEMP0, W_I_2) 1908 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) 1909 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ 1910 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) 1911 FEEDBACK2_to_W_I_2 1912 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) 1913 GAMMA1_1(YMM_TEMP0, W_I_2) 1914 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) 1915 GAMMA1_2(YMM_TEMP0, W_I_2) 1916 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) 1917 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ 1918 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) 1919 FEEDBACK3_to_W_I_2 1920 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) 1921 GAMMA1_1(YMM_TEMP0, W_I_2) 1922 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) 1923 GAMMA1_2(YMM_TEMP0, W_I_2) 1924 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) 1925 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ 1926 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) 1927 1928 MOVE_to_REG(YMM_TEMP0, K, 48) 1929 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) 1930 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) 1931 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) 1932 ADD(YMM_TEMP0, YMM_TEMP0, W_I) 1933 MOVE_to_MEM(W_K, 48, YMM_TEMP0) 1934 1935 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ 1936 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) 1937 GAMMA0_1(W_I_TEMP, W_I_15) 1938 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) 1939 GAMMA0_2(W_I_TEMP, W_I_15) 1940 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) 1941 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */ 1942 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) 1943 ADD(W_I, W_I_7, W_I_TEMP) 1944 
RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) 1945 GAMMA1_1(YMM_TEMP0, W_I_2) 1946 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) 1947 GAMMA1_2(YMM_TEMP0, W_I_2) 1948 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) 1949 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */ 1950 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) 1951 FEEDBACK1_to_W_I_2 1952 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) 1953 FEEDBACK_to_W_I_7 1954 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) 1955 ADD(W_I_TEMP, W_I_7, W_I_TEMP) 1956 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) 1957 GAMMA1_1(YMM_TEMP0, W_I_2) 1958 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) 1959 GAMMA1_2(YMM_TEMP0, W_I_2) 1960 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) 1961 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */ 1962 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) 1963 FEEDBACK2_to_W_I_2 1964 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) 1965 GAMMA1_1(YMM_TEMP0, W_I_2) 1966 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) 1967 GAMMA1_2(YMM_TEMP0, W_I_2) 1968 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) 1969 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */ 1970 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) 1971 FEEDBACK3_to_W_I_2 1972 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) 1973 GAMMA1_1(YMM_TEMP0, W_I_2) 1974 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) 1975 GAMMA1_2(YMM_TEMP0, W_I_2) 1976 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) 1977 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */ 1978 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) 1979 1980 MOVE_to_REG(YMM_TEMP0, K, 56) 1981 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) 1982 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) 1983 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) 1984 ADD(YMM_TEMP0, YMM_TEMP0, W_I) 1985 MOVE_to_MEM(W_K, 56, YMM_TEMP0) 1986 1987 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) 1988 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) 1989 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) 1990 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) 1991 1992 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) 1993 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) 1994 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) 1995 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) 1996 1997 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) 1806 "subq $64, %%rsp\n\t" 1807 1808 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 1809 "leaq 32(%[sha256]), %%rax\n\t" 1810 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 1811 1812 LOAD_DIGEST() 1813 1814 SET_W_K_XFER_4(0) 1815 "movl %%r9d, " L4 "\n\t" 1816 "rorx $6, %%r12d, " L1 "\n\t" 1817 "xorl %%r10d, " L4 "\n\t" 1818 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1819 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1820 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1821 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1822 1823 SET_W_K_XFER_4(16) 1824 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1825 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1826 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1827 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1828 1829 SET_W_K_XFER_4(32) 1830 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1831 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1832 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1833 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 
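The RORX path above uses the same vector message schedule but different round fragments: RND_STEP_RORX_* (and the "rorx $6, %%r12d" in the prologue) compute the Sigma rotations with the BMI2 rorx instruction, a rotate that writes a separate destination and leaves the flags alone, so it interleaves cheaply with the SSE/AVX schedule work. As an illustration only, a scalar sketch of the round those fragments implement, using the same hypothetical ROTR helper as the earlier sketch; the macros rename registers each round instead of shifting the state, so only the values playing d and h change:

    #define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

    static uint32_t Sigma0(uint32_t a) { return ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); }
    static uint32_t Sigma1(uint32_t e) { return ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); }
    static uint32_t Ch (uint32_t e, uint32_t f, uint32_t g) { return (e & f) ^ (~e & g); }
    static uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

    /* One round with precomputed wk = W[i] + K[i]; d and h are updated in
     * place, matching the rotated register lists passed to the macros. */
    static void round_ref(uint32_t a, uint32_t b, uint32_t c, uint32_t* d,
                          uint32_t e, uint32_t f, uint32_t g, uint32_t* h,
                          uint32_t wk)
    {
        uint32_t t1 = *h + Sigma1(e) + Ch(e, f, g) + wk;
        uint32_t t2 = Sigma0(a) + Maj(a, b, c);
        *d += t1;        /* becomes the next round's e */
        *h  = t1 + t2;   /* becomes the next round's a */
    }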
1834 1835 SET_W_K_XFER_4(48) 1836 "xorl " L3 ", " L3 "\n\t" 1837 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1838 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1839 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1840 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1841 /* Prev RND: h += Maj(a,b,c) */ 1842 "addl " L3 ", %%r8d\n\t" 1843 1844 STORE_ADD_DIGEST() 1845 1846 "addq $64, %%rsp\n\t" 1998 1847 1999 1848 : 2000 : [FLIP_16] "m" (mBYTE_FLIP_MASK_16[0]), 2001 [FLIP_15] "m" (mBYTE_FLIP_MASK_15[0]), 2002 [FLIP_7] "m" (mBYTE_FLIP_MASK_7[0]), 2003 [FLIP_2] "m" (mBYTE_FLIP_MASK_2), 2004 [MAPW_7] "m" (mMAPtoW_I_7[0]), 2005 [MAP1W_2] "m" (mMAP1toW_I_2[0]), 2006 [MAP2W_2] "m" (mMAP2toW_I_2[0]), 2007 [MAP3W_2] "m" (mMAP3toW_I_2[0]), 2008 [digest] "m" (sha256->digest), 2009 [buf] "m" (sha256->buffer), 2010 [K] "m" (K), 2011 [W_K] "m" (W_K) 2012 : SSE_REGs, "memory" 1849 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 1850 [SHUF00BA] "m" (mSHUF_00BA[0]), 1851 [SHUFDC00] "m" (mSHUF_DC00[0]), 1852 [sha256] "r" (sha256), 1853 [K] "m" (K) 1854 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 2013 1855 ); 2014 2015 #ifdef WOLFSSL_SMALL_STACK2016 XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);2017 #endif2018 1856 2019 1857 return 0; 2020 1858 } 2021 1859 2022 #endif /* HAVE_INTEL_AVX2 */ 1860 SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256, 1861 word32 len) 1862 { 1863 __asm__ __volatile__ ( 1864 1865 "subq $64, %%rsp\n\t" 1866 "movq 120(%[sha256]), %%rax\n\t" 1867 1868 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 1869 LOAD_DIGEST() 1870 1871 "# Start of loop processing a block\n" 1872 "1:\n\t" 1873 1874 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 1875 1876 SET_W_K_XFER_4(0) 1877 "movl %%r9d, " L4 "\n\t" 1878 "rorx $6, %%r12d, " L1 "\n\t" 1879 "xorl %%r10d, " L4 "\n\t" 1880 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1881 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1882 MsgSched_RORX(X2, X3, X0, X1, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 1883 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1884 1885 SET_W_K_XFER_4(16) 1886 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1887 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1888 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1889 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1890 1891 SET_W_K_XFER_4(32) 1892 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1893 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1894 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1895 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1896 1897 SET_W_K_XFER_4(48) 1898 "xorl " L3 ", " L3 "\n\t" 1899 "xorl " L2 ", " L2 "\n\t" 1900 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 1901 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 1902 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 1903 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 1904 /* Prev RND: h += Maj(a,b,c) */ 1905 "addl " L3 ", %%r8d\n\t" 1906 "movq 120(%[sha256]), %%rax\n\t" 1907 1908 ADD_DIGEST() 1909 1910 "addq $64, %%rax\n\t" 1911 "subl $64, %[len]\n\t" 1912 1913 STORE_DIGEST() 1914 1915 "movq %%rax, 120(%[sha256])\n\t" 1916 "jnz 1b\n\t" 1917 1918 "addq $64, %%rsp\n\t" 1919 1920 : 1921 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 1922 [SHUF00BA] "m" 
(mSHUF_00BA[0]), 1923 [SHUFDC00] "m" (mSHUF_DC00[0]), 1924 [sha256] "r" (sha256), 1925 [len] "r" (len), 1926 [K] "m" (K) 1927 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 1928 ); 1929 1930 return 0; 1931 } 1932 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */ 1933 1934 1935 #if defined(HAVE_INTEL_AVX2) 1936 #define Y0 %ymm0 1937 #define Y1 %ymm1 1938 #define Y2 %ymm2 1939 #define Y3 %ymm3 1940 1941 #define YTMP0 %ymm4 1942 #define YTMP1 %ymm5 1943 #define YTMP2 %ymm6 1944 #define YTMP3 %ymm7 1945 #define YTMP4 %ymm8 1946 #define YTMP5 %ymm9 1947 #define YXFER %ymm10 1948 1949 #define SHUF_Y_00BA %ymm11 /* shuffle xBxA -> 00BA */ 1950 #define SHUF_Y_DC00 %ymm12 /* shuffle xDxC -> DC00 */ 1951 #define BYTE_FLIP_Y_MASK %ymm13 1952 1953 #define YMM_REGS "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", \ 1954 "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13" 1955 1956 #define MsgSched_Y(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \ 1957 RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \ 1958 VPALIGNR (YTMP1, Y1, Y0, 4) /* YTMP1 = W[-15] */ \ 1959 VPALIGNR (YTMP0, Y3, Y2, 4) /* YTMP0 = W[-7] */ \ 1960 RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \ 1961 RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \ 1962 VPSRLD (YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \ 1963 VPSLLD (YTMP3, YTMP1, 25) /* YTEMP3 = W[-15] << (32-7) */ \ 1964 RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \ 1965 RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \ 1966 VPSRLD (YTMP4, YTMP1, 18) /* YTEMP4 = W[-15] >> 18 */ \ 1967 VPSLLD (YTMP5, YTMP1, 14) /* YTEMP5 = W[-15] << (32-18) */ \ 1968 RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \ 1969 RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \ 1970 VPOR (YTMP2, YTMP3, YTMP2) /* YTMP2 = W[-15] >>> 7 */ \ 1971 VPOR (YTMP4, YTMP5, YTMP4) /* YTMP4 = W[-15] >>> 18 */ \ 1972 RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \ 1973 RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \ 1974 RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \ 1975 VPSRLD (YTMP5, YTMP1, 3) /* YTMP4 = W[-15] >> 3 */ \ 1976 VPXOR (YTMP2, YTMP4, YTMP2) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \ 1977 RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \ 1978 RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \ 1979 VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \ 1980 VPSHUFD (YTMP2, Y3, 0b11111010) /* YTMP2 = W[-2] {BBAA}*/ \ 1981 RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \ 1982 RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \ 1983 VPSRLD (YTMP4, YTMP2, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \ 1984 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ 1985 RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \ 1986 RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \ 1987 RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \ 1988 VPSRLQ (YTMP2, YTMP2, 17) /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ 1989 VPADDD (YTMP0, YTMP0, Y0) \ 1990 RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \ 1991 RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \ 1992 RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \ 1993 VPXOR (YTMP2, YTMP3, YTMP2) \ 1994 VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \ 1995 RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \ 1996 VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \ 1997 RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \ 1998 VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \ 1999 RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \ 2000 VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \ 2001 RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \ 2002 RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \ 2003 VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \ 2004 RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \ 2005 VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ 2006 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ 2007 RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \ 2008 
RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \ 2009 VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \ 2010 VPXOR (YTMP4, YTMP3, YTMP4) \ 2011 RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \ 2012 RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \ 2013 VPXOR (YTMP5, YTMP4, YTMP5) /* YTMP5 = s1 {xDxC} */ \ 2014 RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \ 2015 VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \ 2016 RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \ 2017 VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */ 2018 2019 #if defined(HAVE_INTEL_RORX) 2020 2021 #define MsgSched_Y_RORX(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \ 2022 RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \ 2023 VPALIGNR (YTMP1, Y1, Y0, 4) /* YTMP1 = W[-15] */ \ 2024 RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \ 2025 VPALIGNR (YTMP0, Y3, Y2, 4) /* YTMP0 = W[-7] */ \ 2026 RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \ 2027 VPSRLD (YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \ 2028 RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \ 2029 VPSLLD (YTMP3, YTMP1, 25) /* YTEMP3 = W[-15] << (32-7) */ \ 2030 RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \ 2031 VPSRLD (YTMP4, YTMP1, 18) /* YTEMP4 = W[-15] >> 18 */ \ 2032 RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \ 2033 VPSLLD (YTMP5, YTMP1, 14) /* YTEMP5 = W[-15] << (32-18) */ \ 2034 RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \ 2035 VPOR (YTMP2, YTMP2, YTMP3) /* YTMP2 = W[-15] >>> 7 */ \ 2036 RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \ 2037 VPOR (YTMP4, YTMP4, YTMP5) /* YTMP4 = W[-15] >>> 18 */ \ 2038 RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \ 2039 VPSRLD (YTMP5, YTMP1, 3) /* YTMP4 = W[-15] >> 3 */ \ 2040 RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \ 2041 VPXOR (YTMP2, YTMP2, YTMP4) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \ 2042 RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \ 2043 VPSHUFD (YTMP3, Y3, 0b11111010) /* YTMP2 = W[-2] {BBAA}*/ \ 2044 RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \ 2045 VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \ 2046 RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \ 2047 VPSRLD (YTMP4, YTMP3, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \ 2048 RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \ 2049 VPSRLQ (YTMP2, YTMP3, 19) /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ 2050 RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \ 2051 VPSRLQ (YTMP3, YTMP3, 17) /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ 2052 RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \ 2053 VPADDD (YTMP0, YTMP0, Y0) \ 2054 RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \ 2055 VPXOR (YTMP2, YTMP2, YTMP3) \ 2056 RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \ 2057 VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \ 2058 RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \ 2059 VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \ 2060 RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \ 2061 VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \ 2062 RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \ 2063 VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \ 2064 RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \ 2065 VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \ 2066 RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \ 2067 RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \ 2068 VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ 2069 RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \ 2070 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ 2071 RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \ 2072 VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \ 2073 RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \ 2074 VPXOR (YTMP4, YTMP4, YTMP3) \ 2075 RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \ 2076 VPXOR (YTMP5, YTMP5, YTMP4) /* YTMP5 = s1 {xDxC} */ \ 
2077 RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \ 2078 RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \ 2079 VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \ 2080 RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \ 2081 RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \ 2082 VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */ \ 2083 2084 #endif /* HAVE_INTEL_RORX */ 2085 2086 #define _VINSERTI128(op1,op2,op3,op4) \ 2087 "vinserti128 $" #op4 ", %" #op3 ", %" #op2 ", %" #op1 "\n\t" 2088 #define VINSERTI128(op1,op2,op3,op4) \ 2089 _VINSERTI128(op1,op2,op3,op4) 2090 2091 2092 #define _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \ 2093 "# X0, X1, X2, X3 = W[0..15]\n\t" \ 2094 "vmovdqu (%%" #reg "), %%xmm0\n\t" \ 2095 "vmovdqu 16(%%" #reg "), %%xmm1\n\t" \ 2096 VPSHUFB(X0, X0, BYTE_FLIP_MASK) \ 2097 VPSHUFB(X1, X1, BYTE_FLIP_MASK) \ 2098 "vmovdqu 32(%%" #reg "), %%xmm2\n\t" \ 2099 "vmovdqu 48(%%" #reg "), %%xmm3\n\t" \ 2100 VPSHUFB(X2, X2, BYTE_FLIP_MASK) \ 2101 VPSHUFB(X3, X3, BYTE_FLIP_MASK) 2102 2103 #define LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \ 2104 _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) 2105 2106 2107 #define _LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \ 2108 "# X0, X1, X2, X3 = W[0..15]\n\t" \ 2109 "vmovdqu (%%" #reg "), %%xmm0\n\t" \ 2110 "vmovdqu 16(%%" #reg "), %%xmm1\n\t" \ 2111 "vmovdqu 64(%%" #reg "), %%xmm4\n\t" \ 2112 "vmovdqu 80(%%" #reg "), %%xmm5\n\t" \ 2113 VINSERTI128(Y0, Y0, XTMP0, 1) \ 2114 VINSERTI128(Y1, Y1, XTMP1, 1) \ 2115 VPSHUFB(Y0, Y0, BYTE_FLIP_Y_MASK) \ 2116 VPSHUFB(Y1, Y1, BYTE_FLIP_Y_MASK) \ 2117 "vmovdqu 32(%%" #reg "), %%xmm2\n\t" \ 2118 "vmovdqu 48(%%" #reg "), %%xmm3\n\t" \ 2119 "vmovdqu 96(%%" #reg "), %%xmm6\n\t" \ 2120 "vmovdqu 112(%%" #reg "), %%xmm7\n\t" \ 2121 VINSERTI128(Y2, Y2, XTMP2, 1) \ 2122 VINSERTI128(Y3, Y3, XTMP3, 1) \ 2123 VPSHUFB(Y2, Y2, BYTE_FLIP_Y_MASK) \ 2124 VPSHUFB(Y3, Y3, BYTE_FLIP_Y_MASK) 2125 2126 #define LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \ 2127 _LOAD_W_K(BYTE_FLIP_Y_MASK, reg) 2128 2129 2130 #define _SET_W_Y_4(i) \ 2131 "vpaddd (" #i "*8)+ 0+%[K], %%ymm0, %%ymm4\n\t" \ 2132 "vpaddd (" #i "*8)+32+%[K], %%ymm1, %%ymm5\n\t" \ 2133 "vmovdqu %%ymm4, (" #i "*8)+ 0(" WK ")\n\t" \ 2134 "vmovdqu %%ymm5, (" #i "*8)+32(" WK ")\n\t" \ 2135 "vpaddd (" #i "*8)+64+%[K], %%ymm2, %%ymm4\n\t" \ 2136 "vpaddd (" #i "*8)+96+%[K], %%ymm3, %%ymm5\n\t" \ 2137 "vmovdqu %%ymm4, (" #i "*8)+64(" WK ")\n\t" \ 2138 "vmovdqu %%ymm5, (" #i "*8)+96(" WK ")\n\t" 2139 2140 #define SET_W_Y_4(i) \ 2141 _SET_W_Y_4(i) 2142 2143 2144 static const ALIGN32 word64 mSHUF_Y_00BA[] = 2145 { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF, 2146 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ 2147 static const ALIGN32 word64 mSHUF_Y_DC00[] = 2148 { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100, 2149 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ 2150 static const ALIGN32 word64 mBYTE_FLIP_Y_MASK[] = 2151 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 2152 0x0405060700010203, 0x0c0d0e0f08090a0b }; 2153 2154 #define _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ 2155 "vmovdqa %[FLIP], %" #BYTE_FLIP_MASK "\n\t" \ 2156 "vmovdqa %[SHUF00BA], %" #SHUF_00BA "\n\t" \ 2157 "vmovdqa %[SHUFDC00], %" #SHUF_DC00 "\n\t" 2158 2159 #define INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ 2160 _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 2161 2162 static const ALIGN32 word32 K256[128] = { 2163 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 2164 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 2165 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 2166 0x3956C25BL, 0x59F111F1L, 
0x923F82A4L, 0xAB1C5ED5L, 2167 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, 2168 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, 2169 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L, 2170 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L, 2171 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, 2172 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, 2173 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 2174 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 2175 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 2176 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 2177 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L, 2178 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L, 2179 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L, 2180 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L, 2181 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, 2182 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, 2183 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 2184 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 2185 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L, 2186 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L, 2187 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L, 2188 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L, 2189 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L, 2190 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L, 2191 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 2192 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 2193 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L, 2194 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L 2195 }; 2196 2197 SHA256_NOINLINE static int Transform_Sha256_AVX2(wc_Sha256* sha256) 2198 { 2199 __asm__ __volatile__ ( 2200 2201 "subq $512, %%rsp\n\t" 2202 "leaq 32(%[sha256]), %%rax\n\t" 2203 2204 INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 2205 LOAD_DIGEST() 2206 2207 LOAD_W_K_LOW(BYTE_FLIP_MASK, rax) 2208 2209 "movl %%r9d, " L4 "\n\t" 2210 "movl %%r12d, " L1 "\n\t" 2211 "xorl %%r10d, " L4 "\n\t" 2212 2213 SET_W_Y_4(0) 2214 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 2215 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 2216 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 2217 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 2218 2219 SET_W_Y_4(16) 2220 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) 2221 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 2222 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 2223 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) 2224 2225 SET_W_Y_4(32) 2226 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 2227 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 2228 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 2229 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 2230 2231 SET_W_Y_4(48) 2232 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 2233 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 2234 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 2235 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 2236 2237 STORE_ADD_DIGEST() 2238 2239 "addq $512, %%rsp\n\t" 2240 2241 : 2242 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 2243 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 2244 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 2245 [sha256] "r" (sha256), 2246 [K] "m" (K256) 2247 : WORK_REGS, STATE_REGS, 
YMM_REGS, "memory" 2248 ); 2249 2250 return 0; 2251 } 2252 2253 SHA256_NOINLINE static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, 2254 word32 len) 2255 { 2256 if ((len & WC_SHA256_BLOCK_SIZE) != 0) { 2257 XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE); 2258 Transform_Sha256_AVX2(sha256); 2259 sha256->data += WC_SHA256_BLOCK_SIZE; 2260 len -= WC_SHA256_BLOCK_SIZE; 2261 if (len == 0) 2262 return 0; 2263 } 2264 2265 __asm__ __volatile__ ( 2266 2267 "subq $512, %%rsp\n\t" 2268 "movq 120(%[sha256]), %%rax\n\t" 2269 2270 INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 2271 LOAD_DIGEST() 2272 2273 "# Start of loop processing two blocks\n" 2274 "1:\n\t" 2275 2276 LOAD_W_K(BYTE_FLIP_Y_MASK, rax) 2277 2278 "movl %%r9d, " L4 "\n\t" 2279 "movl %%r12d, " L1 "\n\t" 2280 "xorl %%r10d, " L4 "\n\t" 2281 2282 SET_W_Y_4(0) 2283 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 2284 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 2285 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 2286 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 2287 2288 SET_W_Y_4(16) 2289 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) 2290 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 2291 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 2292 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) 2293 2294 SET_W_Y_4(32) 2295 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 2296 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 2297 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 2298 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 2299 2300 SET_W_Y_4(48) 2301 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 2302 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 2303 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 2304 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 2305 2306 ADD_DIGEST() 2307 STORE_DIGEST() 2308 2309 "movl %%r9d, " L4 "\n\t" 2310 "movl %%r12d, " L1 "\n\t" 2311 "xorl %%r10d, " L4 "\n\t" 2312 2313 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4) 2314 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 2315 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20) 2316 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28) 2317 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36) 2318 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44) 2319 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52) 2320 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60) 2321 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68) 2322 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76) 2323 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84) 2324 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92) 2325 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100) 2326 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108) 2327 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116) 2328 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124) 2329 2330 ADD_DIGEST() 2331 2332 "movq 120(%[sha256]), %%rax\n\t" 2333 "addq $128, %%rax\n\t" 2334 "subl $128, %[len]\n\t" 2335 2336 STORE_DIGEST() 2337 2338 "movq %%rax, 120(%[sha256])\n\t" 2339 "jnz 1b\n\t" 2340 2341 "addq $512, %%rsp\n\t" 2342 2343 : 2344 : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]), 2345 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 2346 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 2347 [sha256] "r" 
(sha256), 2348 [len] "r" (len), 2349 [K] "m" (K256) 2350 : WORK_REGS, STATE_REGS, YMM_REGS, "memory" 2351 ); 2352 2353 return 0; 2354 } 2355 2356 #if defined(HAVE_INTEL_RORX) 2357 SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX(wc_Sha256* sha256) 2358 { 2359 __asm__ __volatile__ ( 2360 2361 "subq $512, %%rsp\n\t" 2362 "leaq 32(%[sha256]), %%rax\n\t" 2363 2364 INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 2365 LOAD_W_K_LOW(BYTE_FLIP_MASK, rax) 2366 2367 LOAD_DIGEST() 2368 2369 "movl %%r9d, " L4 "\n\t" 2370 "rorx $6, %%r12d, " L1 "\n\t" 2371 "xorl %%r10d, " L4 "\n\t" 2372 2373 SET_W_Y_4(0) 2374 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 2375 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 2376 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 2377 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 2378 2379 SET_W_Y_4(16) 2380 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) 2381 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 2382 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 2383 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) 2384 2385 SET_W_Y_4(32) 2386 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 2387 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 2388 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 2389 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 2390 2391 SET_W_Y_4(48) 2392 "xorl " L3 ", " L3 "\n\t" 2393 "xorl " L2 ", " L2 "\n\t" 2394 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 2395 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 2396 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 2397 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 2398 /* Prev RND: h += Maj(a,b,c) */ 2399 "addl " L3 ", %%r8d\n\t" 2400 2401 STORE_ADD_DIGEST() 2402 2403 "addq $512, %%rsp\n\t" 2404 2405 : 2406 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 2407 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 2408 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 2409 [sha256] "r" (sha256), 2410 [K] "m" (K256) 2411 : WORK_REGS, STATE_REGS, YMM_REGS, "memory" 2412 ); 2413 2414 return 0; 2415 } 2416 2417 SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, 2418 word32 len) 2419 { 2420 if ((len & WC_SHA256_BLOCK_SIZE) != 0) { 2421 XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE); 2422 Transform_Sha256_AVX2_RORX(sha256); 2423 sha256->data += WC_SHA256_BLOCK_SIZE; 2424 len -= WC_SHA256_BLOCK_SIZE; 2425 if (len == 0) 2426 return 0; 2427 } 2428 2429 __asm__ __volatile__ ( 2430 2431 "subq $512, %%rsp\n\t" 2432 "movq 120(%[sha256]), %%rax\n\t" 2433 2434 INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 2435 LOAD_DIGEST() 2436 2437 "# Start of loop processing two blocks\n" 2438 "1:\n\t" 2439 2440 LOAD_W_K(BYTE_FLIP_Y_MASK, rax) 2441 2442 "movl %%r9d, " L4 "\n\t" 2443 "rorx $6, %%r12d, " L1 "\n\t" 2444 "xorl %%r10d, " L4 "\n\t" 2445 2446 SET_W_Y_4(0) 2447 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 2448 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 2449 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 2450 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 2451 2452 SET_W_Y_4(16) 2453 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, 
S_3, S_4, S_5, S_6, S_7, 32) 2454 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 2455 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 2456 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) 2457 2458 SET_W_Y_4(32) 2459 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 2460 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 2461 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 2462 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 2463 2464 SET_W_Y_4(48) 2465 "xorl " L3 ", " L3 "\n\t" 2466 "xorl " L2 ", " L2 "\n\t" 2467 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 2468 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 2469 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 2470 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 2471 /* Prev RND: h += Maj(a,b,c) */ 2472 "addl " L3 ", %%r8d\n\t" 2473 "xorl " L2 ", " L2 "\n\t" 2474 2475 ADD_DIGEST() 2476 STORE_DIGEST() 2477 2478 "movl %%r9d, " L4 "\n\t" 2479 "xorl " L3 ", " L3 "\n\t" 2480 "xorl %%r10d, " L4 "\n\t" 2481 2482 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4) 2483 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 2484 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20) 2485 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28) 2486 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36) 2487 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44) 2488 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52) 2489 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60) 2490 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68) 2491 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76) 2492 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84) 2493 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92) 2494 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100) 2495 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108) 2496 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116) 2497 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124) 2498 /* Prev RND: h += Maj(a,b,c) */ 2499 "addl " L3 ", %%r8d\n\t" 2500 "movq 120(%[sha256]), %%rax\n\t" 2501 2502 ADD_DIGEST() 2503 2504 "addq $128, %%rax\n\t" 2505 "subl $128, %[len]\n\t" 2506 2507 STORE_DIGEST() 2508 2509 "movq %%rax, 120(%[sha256])\n\t" 2510 "jnz 1b\n\t" 2511 2512 "addq $512, %%rsp\n\t" 2513 2514 : 2515 : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]), 2516 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 2517 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 2518 [sha256] "r" (sha256), 2519 [len] "r" (len), 2520 [K] "m" (K256) 2521 : WORK_REGS, STATE_REGS, YMM_REGS, "memory" 2522 ); 2523 2524 return 0; 2525 } 2526 #endif /* HAVE_INTEL_RORX */ 2527 #endif /* HAVE_INTEL_AVX2 */ 2023 2528 2024 2529 2025 2530 #ifdef WOLFSSL_SHA224 2026 2531 2027 #ifdef STM32_HASH 2028 2029 #define Sha256Update Sha224Update 2030 #define Sha256Final Sha224Final 2031 2032 /* 2033 * STM32F2/F4/F7 hardware SHA224 support through the HASH_* API's from the 2034 * Standard Peripheral Library or CubeMX (See note in README). 
2035 */ 2036 2037 /* STM32 register size, bytes */ 2038 #ifdef WOLFSSL_STM32_CUBEMX 2039 #define SHA224_REG_SIZE WC_SHA224_BLOCK_SIZE 2040 #else 2041 #define SHA224_REG_SIZE 4 2042 /* STM32 struct notes: 2043 * sha224->buffer = first 4 bytes used to hold partial block if needed 2044 * sha224->buffLen = num bytes currently stored in sha256->buffer 2045 * sha224->loLen = num bytes that have been written to STM32 FIFO 2046 */ 2047 #endif 2048 #define SHA224_HW_TIMEOUT 0xFF 2049 2050 static int InitSha224(wc_Sha224* sha224) 2532 #ifdef STM32_HASH_SHA2 2533 2534 /* Supports CubeMX HAL or Standard Peripheral Library */ 2535 2536 int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId) 2051 2537 { 2052 2538 if (sha224 == NULL) 2053 2539 return BAD_FUNC_ARG; 2054 2540 2055 XMEMSET(sha224->buffer, 0, sizeof(sha224->buffer)); 2056 sha224->buffLen = 0; 2057 sha224->loLen = 0; 2058 sha224->hiLen = 0; 2059 2060 /* initialize HASH peripheral */ 2061 #ifdef WOLFSSL_STM32_CUBEMX 2062 HAL_HASH_DeInit(&sha224->hashHandle); 2063 sha224->hashHandle.Init.DataType = HASH_DATATYPE_8B; 2064 if (HAL_HASH_Init(&sha224->hashHandle) != HAL_OK) { 2065 return ASYNC_INIT_E; 2066 } 2067 /* required because Cube MX is not clearing algo bits */ 2068 HASH->CR &= ~HASH_CR_ALGO; 2069 #else 2070 HASH_DeInit(); 2071 2072 /* reset the hash control register */ 2073 /* required because Cube MX is not clearing algo bits */ 2074 HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE); 2075 2076 /* configure algo used, algo mode, datatype */ 2077 HASH->CR |= (HASH_AlgoSelection_SHA224 | HASH_AlgoMode_HASH 2078 | HASH_DataType_8b); 2079 2080 /* reset HASH processor */ 2081 HASH->CR |= HASH_CR_INIT; 2082 #endif 2083 2541 (void)devId; 2542 (void)heap; 2543 2544 wc_Stm32_Hash_Init(&sha224->stmCtx); 2084 2545 return 0; 2085 2546 } 2086 2547 2087 static int Sha224Update(wc_Sha256* sha224, const byte* data, word32 len)2548 int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len) 2088 2549 { 2089 2550 int ret = 0; 2090 byte* local; 2091 2092 /* do block size increments */ 2093 local = (byte*)sha224->buffer; 2094 2095 /* check that internal buffLen is valid */ 2096 if (sha224->buffLen >= SHA224_REG_SIZE) 2097 return BUFFER_E; 2098 2099 while (len) { 2100 word32 add = min(len, SHA224_REG_SIZE - sha224->buffLen); 2101 XMEMCPY(&local[sha224->buffLen], data, add); 2102 2103 sha224->buffLen += add; 2104 data += add; 2105 len -= add; 2106 2107 if (sha224->buffLen == SHA224_REG_SIZE) { 2108 #ifdef WOLFSSL_STM32_CUBEMX 2109 if (HAL_HASHEx_SHA224_Accumulate( 2110 &sha224->hashHandle, local, SHA224_REG_SIZE) != HAL_OK) { 2111 ret = ASYNC_OP_E; 2112 } 2113 #else 2114 HASH_DataIn(*(uint32_t*)local); 2115 #endif 2116 2117 AddLength(sha224, SHA224_REG_SIZE); 2118 sha224->buffLen = 0; 2551 2552 if (sha224 == NULL || (data == NULL && len > 0)) { 2553 return BAD_FUNC_ARG; 2119 2554 } 2555 2556 ret = wolfSSL_CryptHwMutexLock(); 2557 if (ret == 0) { 2558 ret = wc_Stm32_Hash_Update(&sha224->stmCtx, 2559 HASH_AlgoSelection_SHA224, data, len); 2560 wolfSSL_CryptHwMutexUnLock(); 2120 2561 } 2121 2562 return ret; 2122 2563 } 2123 2564 2124 static int Sha224Final(wc_Sha256* sha224)2565 int wc_Sha224Final(wc_Sha224* sha224, byte* hash) 2125 2566 { 2126 2567 int ret = 0; 2127 2568 2128 #ifdef WOLFSSL_STM32_CUBEMX 2129 if (HAL_HASHEx_SHA224_Start(&sha224->hashHandle, 2130 (byte*)sha224->buffer, sha224->buffLen, 2131 (byte*)sha224->digest, SHA224_HW_TIMEOUT) != HAL_OK) { 2132 ret = ASYNC_OP_E; 2133 } 2134 #else 2135 __IO uint16_t nbvalidbitsdata 
= 0; 2136 2137 /* finish reading any trailing bytes into FIFO */ 2138 if (sha224->buffLen > 0) { 2139 HASH_DataIn(*(uint32_t*)sha224->buffer); 2140 AddLength(sha224, sha224->buffLen); 2141 } 2142 2143 /* calculate number of valid bits in last word of input data */ 2144 nbvalidbitsdata = 8 * (sha224->loLen % SHA224_REG_SIZE); 2145 2146 /* configure number of valid bits in last word of the data */ 2147 HASH_SetLastWordValidBitsNbr(nbvalidbitsdata); 2148 2149 /* start HASH processor */ 2150 HASH_StartDigest(); 2151 2152 /* wait until Busy flag == RESET */ 2153 while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {} 2154 2155 /* read message digest */ 2156 sha224->digest[0] = HASH->HR[0]; 2157 sha224->digest[1] = HASH->HR[1]; 2158 sha224->digest[2] = HASH->HR[2]; 2159 sha224->digest[3] = HASH->HR[3]; 2160 sha224->digest[4] = HASH->HR[4]; 2161 sha224->digest[5] = HASH_DIGEST->HR[5]; 2162 sha224->digest[6] = HASH_DIGEST->HR[6]; 2163 2164 ByteReverseWords(sha224->digest, sha224->digest, SHA224_DIGEST_SIZE); 2165 #endif /* WOLFSSL_STM32_CUBEMX */ 2569 if (sha224 == NULL || hash == NULL) { 2570 return BAD_FUNC_ARG; 2571 } 2572 2573 ret = wolfSSL_CryptHwMutexLock(); 2574 if (ret == 0) { 2575 ret = wc_Stm32_Hash_Final(&sha224->stmCtx, 2576 HASH_AlgoSelection_SHA224, hash, WC_SHA224_DIGEST_SIZE); 2577 wolfSSL_CryptHwMutexUnLock(); 2578 } 2579 2580 (void)wc_InitSha224(sha224); /* reset state */ 2166 2581 2167 2582 return ret; 2168 2583 } 2169 2584 2585 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH) 2586 /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */ 2587 2588 #elif defined(WOLFSSL_AFALG_HASH) 2589 #error SHA224 currently not supported with AF_ALG enabled 2590 2591 #elif defined(WOLFSSL_DEVCRYPTO_HASH) 2592 /* implemented in wolfcrypt/src/port/devcrypto/devcrypt_hash.c */ 2593 2170 2594 #else 2595 2596 #define NEED_SOFT_SHA224 2597 2171 2598 2172 2599 static int InitSha224(wc_Sha224* sha224) … … 2199 2626 } 2200 2627 2201 #endif /* STM32_HASH */ 2202 2628 #endif 2629 2630 #ifdef NEED_SOFT_SHA224 2203 2631 int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId) 2204 2632 { … … 2213 2641 if (ret != 0) 2214 2642 return ret; 2643 2644 #ifdef WOLFSSL_SMALL_STACK_CACHE 2645 sha224->W = NULL; 2646 #endif 2215 2647 2216 2648 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224) … … 2224 2656 } 2225 2657 2226 int wc_InitSha224(wc_Sha224* sha224)2227 {2228 return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);2229 }2230 2231 2658 int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len) 2232 2659 { … … 2271 2698 return ret; 2272 2699 2273 #if defined(LITTLE_ENDIAN_ORDER) && !defined(STM32_HASH)2700 #if defined(LITTLE_ENDIAN_ORDER) 2274 2701 ByteReverseWords(sha224->digest, sha224->digest, WC_SHA224_DIGEST_SIZE); 2275 2702 #endif … … 2277 2704 2278 2705 return InitSha224(sha224); /* reset state */ 2706 } 2707 #endif /* end of SHA224 software implementation */ 2708 2709 int wc_InitSha224(wc_Sha224* sha224) 2710 { 2711 return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID); 2279 2712 } 2280 2713 … … 2283 2716 if (sha224 == NULL) 2284 2717 return; 2718 2719 #ifdef WOLFSSL_SMALL_STACK_CACHE 2720 if (sha224->W != NULL) { 2721 XFREE(sha224->W, NULL, DYNAMIC_TYPE_DIGEST); 2722 sha224->W = NULL; 2723 } 2724 #endif 2285 2725 2286 2726 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224) 2287 2727 wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224); 2288 2728 #endif /* WOLFSSL_ASYNC_CRYPT */ 2289 } 2290 2729 2730 #ifdef 
WOLFSSL_PIC32MZ_HASH 2731 wc_Sha256Pic32Free(sha224); 2732 #endif 2733 } 2291 2734 #endif /* WOLFSSL_SHA224 */ 2292 2735 … … 2302 2745 return; 2303 2746 2747 #ifdef WOLFSSL_SMALL_STACK_CACHE 2748 if (sha256->W != NULL) { 2749 XFREE(sha256->W, NULL, DYNAMIC_TYPE_DIGEST); 2750 sha256->W = NULL; 2751 } 2752 #endif 2753 2304 2754 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 2305 2755 wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256); 2306 2756 #endif /* WOLFSSL_ASYNC_CRYPT */ 2757 #ifdef WOLFSSL_PIC32MZ_HASH 2758 wc_Sha256Pic32Free(sha256); 2759 #endif 2760 #if defined(WOLFSSL_AFALG_HASH) 2761 if (sha256->alFd > 0) { 2762 close(sha256->alFd); 2763 sha256->alFd = -1; /* avoid possible double close on socket */ 2764 } 2765 if (sha256->rdFd > 0) { 2766 close(sha256->rdFd); 2767 sha256->rdFd = -1; /* avoid possible double close on socket */ 2768 } 2769 #endif /* WOLFSSL_AFALG_HASH */ 2770 #ifdef WOLFSSL_DEVCRYPTO_HASH 2771 wc_DevCryptoFree(&sha256->ctx); 2772 #endif /* WOLFSSL_DEVCRYPTO */ 2773 #if defined(WOLFSSL_AFALG_HASH_KEEP) || \ 2774 (defined(WOLFSSL_DEVCRYPTO_HASH) && defined(WOLFSSL_DEVCRYPTO_HASH_KEEP)) 2775 if (sha256->msg != NULL) { 2776 XFREE(sha256->msg, sha256->heap, DYNAMIC_TYPE_TMP_BUFFER); 2777 sha256->msg = NULL; 2778 } 2779 #endif 2307 2780 } 2308 2781 … … 2324 2797 if (ret == 0) { 2325 2798 ret = wc_Sha224Final(&tmpSha224, hash); 2799 wc_Sha224Free(&tmpSha224); 2326 2800 } 2327 2801 return ret; … … 2335 2809 2336 2810 XMEMCPY(dst, src, sizeof(wc_Sha224)); 2811 #ifdef WOLFSSL_SMALL_STACK_CACHE 2812 dst->W = NULL; 2813 #endif 2337 2814 2338 2815 #ifdef WOLFSSL_ASYNC_CRYPT … … 2343 2820 } 2344 2821 #endif /* WOLFSSL_SHA224 */ 2822 2823 #ifdef WOLFSSL_AFALG_HASH 2824 /* implemented in wolfcrypt/src/port/af_alg/afalg_hash.c */ 2825 2826 #elif defined(WOLFSSL_DEVCRYPTO_HASH) 2827 /* implemented in wolfcrypt/src/port/devcrypto/devcrypt_hash.c */ 2828 2829 #else 2345 2830 2346 2831 int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash) … … 2355 2840 if (ret == 0) { 2356 2841 ret = wc_Sha256Final(&tmpSha256, hash); 2842 wc_Sha256Free(&tmpSha256); 2357 2843 } 2358 2844 return ret; … … 2366 2852 2367 2853 XMEMCPY(dst, src, sizeof(wc_Sha256)); 2854 #ifdef WOLFSSL_SMALL_STACK_CACHE 2855 dst->W = NULL; 2856 #endif 2368 2857 2369 2858 #ifdef WOLFSSL_ASYNC_CRYPT … … 2376 2865 return ret; 2377 2866 } 2867 #endif 2378 2868 #endif /* !WOLFSSL_TI_HASH */ 2379 2869
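For reference only (not part of the changeset above): a minimal caller-side sketch of how the wc_Sha224 functions this diff touches (wc_InitSha224, wc_Sha224Update, wc_Sha224Final, wc_Sha224Free) are typically driven. It assumes a build with WOLFSSL_SHA224 enabled and the wolfSSL headers on the include path; the function name sha224_demo and the "abc" input are illustrative, not from the source.

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/sha256.h>   /* declares the wc_Sha224 API used below */
    #include <stdio.h>
    #include <string.h>

    /* Hash a short message with SHA-224 and print the digest (sketch). */
    int sha224_demo(void)
    {
        wc_Sha224   sha224;
        byte        digest[WC_SHA224_DIGEST_SIZE];
        const char* msg = "abc";              /* illustrative input */
        int         ret;

        ret = wc_InitSha224(&sha224);         /* selects HW or SW path per build */
        if (ret != 0)
            return ret;

        ret = wc_Sha224Update(&sha224, (const byte*)msg, (word32)strlen(msg));
        if (ret == 0)
            ret = wc_Sha224Final(&sha224, digest);   /* also resets the state */

        wc_Sha224Free(&sha224);               /* releases HW/async resources */

        if (ret == 0) {
            int i;
            for (i = 0; i < WC_SHA224_DIGEST_SIZE; i++)
                printf("%02x", digest[i]);
            printf("\n");
        }
        return ret;
    }

Whichever backend the build selects (the STM32_HASH_SHA2, CAAM, AF_ALG, devcrypto, or software paths seen in the diff), the caller-facing sequence stays the same; only the wrapper bodies differ.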