- Timestamp: Feb 7, 2019, 8:36:33 AM
- Files: 1 edited
asp3_tinet_ecnl_rx/trunk/wolfssl-3.12.2/wolfcrypt/src/sha512.c
r337 r372 27 27 #include <wolfssl/wolfcrypt/settings.h> 28 28 29 #ifdef WOLFSSL_SHA512 29 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384) 30 31 #if defined(HAVE_FIPS) && \ 32 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) 33 34 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ 35 #define FIPS_NO_WRAPPERS 36 37 #ifdef USE_WINDOWS_API 38 #pragma code_seg(".fipsA$k") 39 #pragma const_seg(".fipsB$k") 40 #endif 41 #endif 42 30 43 #include <wolfssl/wolfcrypt/sha512.h> 31 44 #include <wolfssl/wolfcrypt/error-crypt.h> 32 45 #include <wolfssl/wolfcrypt/cpuid.h> 33 46 47 /* deprecated USE_SLOW_SHA2 (replaced with USE_SLOW_SHA512) */ 48 #if defined(USE_SLOW_SHA2) && !defined(USE_SLOW_SHA512) 49 #define USE_SLOW_SHA512 50 #endif 51 34 52 /* fips wrapper calls, user can call direct */ 35 #ifdef HAVE_FIPS 53 #if defined(HAVE_FIPS) && \ 54 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2)) 55 56 #ifdef WOLFSSL_SHA512 57 36 58 int wc_InitSha512(wc_Sha512* sha) 37 59 { … … 72 94 /* Not supported in FIPS */ 73 95 } 96 #endif 74 97 75 98 #if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM) … … 111 134 #endif /* WOLFSSL_SHA384 || HAVE_AESGCM */ 112 135 113 #else /* else build without using fips*/136 #else /* else build without fips, or for FIPS v2 */ 114 137 115 138 #include <wolfssl/wolfcrypt/logging.h> … … 124 147 125 148 #if defined(USE_INTEL_SPEEDUP) 149 #if defined(__GNUC__) && ((__GNUC__ < 4) || \ 150 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) 151 #undef NO_AVX2_SUPPORT 152 #define NO_AVX2_SUPPORT 153 #endif 154 #if defined(__clang__) && ((__clang_major__ < 3) || \ 155 (__clang_major__ == 3 && __clang_minor__ <= 5)) 156 #define NO_AVX2_SUPPORT 157 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT) 158 #undef NO_AVX2_SUPPORT 159 #endif 160 126 161 #define HAVE_INTEL_AVX1 162 #ifndef NO_AVX2_SUPPORT 127 163 #define HAVE_INTEL_AVX2 164 #endif 128 165 #endif 129 166 … … 136 173 /* #define DEBUG_YMM */ 137 174 #endif 138 139 140 #if defined(HAVE_INTEL_RORX)141 #define ROTR(func, bits, x) \142 word64 func(word64 x) { word64 ret ;\143 __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x)) ;\144 return ret ;\145 }146 147 static INLINE ROTR(rotrFixed64_28, 28, x);148 static INLINE ROTR(rotrFixed64_34, 34, x);149 static INLINE ROTR(rotrFixed64_39, 39, x);150 static INLINE ROTR(rotrFixed64_14, 14, x);151 static INLINE ROTR(rotrFixed64_18, 18, x);152 static INLINE ROTR(rotrFixed64_41, 41, x);153 154 #define S0_RORX(x) (rotrFixed64_28(x)^rotrFixed64_34(x)^rotrFixed64_39(x))155 #define S1_RORX(x) (rotrFixed64_14(x)^rotrFixed64_18(x)^rotrFixed64_41(x))156 #endif /* HAVE_INTEL_RORX */157 175 158 176 #if defined(HAVE_BYTEREVERSE64) && \ … … 167 185 #endif 168 186 187 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH) 188 /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */ 189 #else 190 191 #ifdef WOLFSSL_SHA512 192 169 193 static int InitSha512(wc_Sha512* sha512) 170 194 { … … 188 212 } 189 213 214 #endif /* WOLFSSL_SHA512 */ 190 215 191 216 /* Hardware Acceleration */ 192 217 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 218 219 #ifdef WOLFSSL_SHA512 193 220 194 221 /***** … … 208 235 209 236 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) 210 Transform_ AVX1(); # Function prototype211 Transform_ AVX2(); #237 Transform_Sha512_AVX1(); # Function prototype 238 Transform_Sha512_AVX2(); # 212 239 #endif 213 240 214 _Transform () { # Native Transform Function body241 _Transform_Sha512() { # Native Transform Function body 215 242 216 243 } … 
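For context on the renames above: _Transform_Sha512 and the Transform_Sha512_AVX* variants all compute the standard FIPS 180-4 SHA-512 compression rounds that the R(i) macro expresses. A minimal portable-C sketch of one round is given below; rotr64 and the variable layout are illustrative only, not the wolfSSL sources.

#include <stdint.h>

/* One SHA-512 round (FIPS 180-4); illustrative sketch, not wolfSSL code. */
static uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

#define Sigma0(x) (rotr64(x,28) ^ rotr64(x,34) ^ rotr64(x,39))
#define Sigma1(x) (rotr64(x,14) ^ rotr64(x,18) ^ rotr64(x,41))
#define Ch(x,y,z)  ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))

/* s[0..7] = working variables a..h, k = round constant K512[t], w = schedule word W[t]. */
static void sha512_round(uint64_t s[8], uint64_t k, uint64_t w)
{
    uint64_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;
    uint64_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
    s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
    s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}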
… 241 268 #if defnied(HAVE_INTEL_AVX1) 242 269 243 int Transform_ AVX1() {270 int Transform_Sha512_AVX1() { 244 271 Stitched Message Sched/Round 245 272 } … … 249 276 #if defnied(HAVE_INTEL_AVX2) 250 277 251 int Transform_ AVX2() {278 int Transform_Sha512_AVX2() { 252 279 Stitched Message Sched/Round 253 280 } … … 262 289 263 290 #if defined(HAVE_INTEL_AVX1) 264 static int Transform_AVX1(wc_Sha512 *sha512); 291 static int Transform_Sha512_AVX1(wc_Sha512 *sha512); 292 static int Transform_Sha512_AVX1_Len(wc_Sha512 *sha512, word32 len); 265 293 #endif 266 294 #if defined(HAVE_INTEL_AVX2) 267 static int Transform_AVX2(wc_Sha512 *sha512); 268 #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) 269 static int Transform_AVX1_RORX(wc_Sha512 *sha512); 295 static int Transform_Sha512_AVX2(wc_Sha512 *sha512); 296 static int Transform_Sha512_AVX2_Len(wc_Sha512 *sha512, word32 len); 297 #if defined(HAVE_INTEL_RORX) 298 static int Transform_Sha512_AVX1_RORX(wc_Sha512 *sha512); 299 static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512 *sha512, 300 word32 len); 301 static int Transform_Sha512_AVX2_RORX(wc_Sha512 *sha512); 302 static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512 *sha512, 303 word32 len); 270 304 #endif 271 305 #endif 272 static int _Transform(wc_Sha512 *sha512); 273 static int (*Transform_p)(wc_Sha512* sha512) = _Transform; 306 static int _Transform_Sha512(wc_Sha512 *sha512); 307 static int (*Transform_Sha512_p)(wc_Sha512* sha512) = _Transform_Sha512; 308 static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL; 274 309 static int transform_check = 0; 275 310 static int intel_flags; 276 #define Transform(sha512) (*Transform_p)(sha512) 277 278 /* Dummy for saving MM_REGs on behalf of Transform */ 279 /* #if defined(HAVE_INTEL_AVX2) 280 #define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\ 281 "%ymm0","%ymm1","%ymm2","%ymm3","%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\ 282 "%ymm12","%ymm13","%ymm14","%ymm15") 283 */ 284 #if defined(HAVE_INTEL_AVX1) 285 #define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\ 286 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15") 287 #endif 311 #define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512) 312 #define Transform_Sha512_Len(sha512, len) \ 313 (*Transform_Sha512_Len_p)(sha512, len) 288 314 289 315 static void Sha512_SetTransform() … … 295 321 296 322 #if defined(HAVE_INTEL_AVX2) 297 if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) { 298 if (1) 299 Transform_p = Transform_AVX1_RORX; 323 if (IS_INTEL_AVX2(intel_flags)) { 324 #ifdef HAVE_INTEL_RORX 325 if (IS_INTEL_BMI2(intel_flags)) { 326 Transform_Sha512_p = Transform_Sha512_AVX2_RORX; 327 Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len; 328 } 300 329 else 301 Transform_p = Transform_AVX2; 330 #endif 331 if (1) { 332 Transform_Sha512_p = Transform_Sha512_AVX2; 333 Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len; 334 } 335 #ifdef HAVE_INTEL_RORX 336 else { 337 Transform_Sha512_p = Transform_Sha512_AVX1_RORX; 338 Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len; 339 } 340 #endif 302 341 } 303 342 else 304 343 #endif 305 344 #if defined(HAVE_INTEL_AVX1) 306 if ( 1) {307 Transform_ p = ((IS_INTEL_AVX1(intel_flags)) ? 
Transform_AVX1 :308 _Transform);345 if (IS_INTEL_AVX1(intel_flags)) { 346 Transform_Sha512_p = Transform_Sha512_AVX1; 347 Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len; 309 348 } 310 349 else 311 350 #endif 312 Transform_ p = _Transform;351 Transform_Sha512_p = _Transform_Sha512; 313 352 314 353 transform_check = 1; 315 354 } 316 317 int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) 318 { 319 int ret = InitSha512(sha512); 320 321 (void)heap; 322 (void)devId; 323 324 Sha512_SetTransform(); 325 326 return ret; 327 } 355 #endif /* WOLFSSL_SHA512 */ 328 356 329 357 #else 330 #define Transform(sha512) _Transform(sha512) 358 #define Transform_Sha512(sha512) _Transform_Sha512(sha512) 359 360 #endif 361 362 #ifdef WOLFSSL_SHA512 331 363 332 364 int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) … … 342 374 if (ret != 0) 343 375 return ret; 376 377 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 378 Sha512_SetTransform(); 379 #endif 380 381 #ifdef WOLFSSL_SMALL_STACK_CACHE 382 sha512->W = NULL; 383 #endif 344 384 345 385 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512) … … 353 393 } 354 394 355 #endif /* Hardware Acceleration */ 356 357 #ifndef SAVE_XMM_YMM 358 #define SAVE_XMM_YMM 359 #endif 395 #endif /* WOLFSSL_SHA512 */ 396 360 397 361 398 static const word64 K512[80] = { … … 402 439 }; 403 440 404 405 406 441 #define blk0(i) (W[i] = sha512->buffer[i]) 407 442 408 #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15])) 443 #define blk2(i) (\ 444 W[ i & 15] += \ 445 s1(W[(i-2) & 15])+ \ 446 W[(i-7) & 15] + \ 447 s0(W[(i-15) & 15]) \ 448 ) 409 449 410 450 #define Ch(x,y,z) (z^(x&(y^z))) … … 425 465 #define s1(x) (rotrFixed64(x,19)^rotrFixed64(x,61)^(x>>6)) 426 466 427 #define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\ 428 d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) 429 430 static int _Transform(wc_Sha512* sha512) 467 #define R(i) \ 468 h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j ? 
blk2(i) : blk0(i)); \ 469 d(i) += h(i); \ 470 h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) 471 472 static int _Transform_Sha512(wc_Sha512* sha512) 431 473 { 432 474 const word64* K = K512; 433 434 475 word32 j; 435 476 word64 T[8]; 436 477 437 438 #ifdef WOLFSSL_SMALL_STACK 478 #ifdef WOLFSSL_SMALL_STACK_CACHE 479 word64* W = sha512->W; 480 if (W == NULL) { 481 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, 482 DYNAMIC_TYPE_TMP_BUFFER); 483 if (W == NULL) 484 return MEMORY_E; 485 sha512->W = W; 486 } 487 #elif defined(WOLFSSL_SMALL_STACK) 439 488 word64* W; 440 489 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER); … … 448 497 XMEMCPY(T, sha512->digest, sizeof(T)); 449 498 450 #ifdef USE_SLOW_SHA 2499 #ifdef USE_SLOW_SHA512 451 500 /* over twice as small, but 50% slower */ 452 501 /* 80 operations, not unrolled */ … … 465 514 R(12); R(13); R(14); R(15); 466 515 } 467 #endif /* USE_SLOW_SHA 2 */516 #endif /* USE_SLOW_SHA512 */ 468 517 469 518 /* Add the working vars back into digest */ 470 471 519 sha512->digest[0] += a(0); 472 520 sha512->digest[1] += b(0); … … 482 530 ForceZero(T, sizeof(T)); 483 531 484 #if def WOLFSSL_SMALL_STACK532 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE) 485 533 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 486 534 #endif … … 490 538 491 539 492 static INLINE void AddLength(wc_Sha512* sha512, word32 len)540 static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len) 493 541 { 494 542 word64 tmp = sha512->loLen; … … 497 545 } 498 546 499 static INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)547 static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) 500 548 { 501 549 int ret = 0; … … 507 555 return BUFFER_E; 508 556 509 SAVE_XMM_YMM; /* for Intel AVX */ 510 511 while (len) { 557 if (sha512->buffLen > 0) { 512 558 word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen); 559 if (add > 0) { 513 560 XMEMCPY(&local[sha512->buffLen], data, add); 514 561 … … 516 563 data += add; 517 564 len -= add; 565 } 518 566 519 567 if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) { … … 527 575 } 528 576 #endif 529 ret = Transform(sha512); 577 ret = Transform_Sha512(sha512); 578 if (ret == 0) { 579 AddLength(sha512, WC_SHA512_BLOCK_SIZE); 580 sha512->buffLen = 0; 581 } 582 else 583 len = 0; 584 } 585 } 586 587 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 588 if (Transform_Sha512_Len_p != NULL) { 589 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); 590 591 if (blocksLen > 0) { 592 AddLength(sha512, blocksLen); 593 sha512->data = data; 594 /* Byte reversal performed in function if required. */ 595 Transform_Sha512_Len(sha512, blocksLen); 596 data += blocksLen; 597 len -= blocksLen; 598 } 599 } 600 else 601 #endif 602 #if !defined(LITTLE_ENDIAN_ORDER) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 603 { 604 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); 605 606 AddLength(sha512, blocksLen); 607 while (len >= WC_SHA512_BLOCK_SIZE) { 608 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE); 609 610 data += WC_SHA512_BLOCK_SIZE; 611 len -= WC_SHA512_BLOCK_SIZE; 612 613 /* Byte reversal performed in function if required. 
*/ 614 ret = Transform_Sha512(sha512); 530 615 if (ret != 0) 531 616 break; 532 533 AddLength(sha512, WC_SHA512_BLOCK_SIZE);534 sha512->buffLen = 0;535 617 } 618 } 619 #else 620 { 621 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); 622 623 AddLength(sha512, blocksLen); 624 while (len >= WC_SHA512_BLOCK_SIZE) { 625 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE); 626 627 data += WC_SHA512_BLOCK_SIZE; 628 len -= WC_SHA512_BLOCK_SIZE; 629 630 ByteReverseWords64(sha512->buffer, sha512->buffer, 631 WC_SHA512_BLOCK_SIZE); 632 ret = Transform_Sha512(sha512); 633 if (ret != 0) 634 break; 635 } 636 } 637 #endif 638 639 if (len > 0) { 640 XMEMCPY(local, data, len); 641 sha512->buffLen = len; 536 642 } 537 643 538 644 return ret; 539 645 } 646 647 #ifdef WOLFSSL_SHA512 540 648 541 649 int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) … … 556 664 } 557 665 558 559 static INLINE int Sha512Final(wc_Sha512* sha512) 666 #endif /* WOLFSSL_SHA512 */ 667 668 #endif /* WOLFSSL_IMX6_CAAM */ 669 670 static WC_INLINE int Sha512Final(wc_Sha512* sha512) 560 671 { 561 672 byte* local = (byte*)sha512->buffer; … … 566 677 } 567 678 568 SAVE_XMM_YMM ; /* for Intel AVX */569 679 AddLength(sha512, sha512->buffLen); /* before adding pads */ 570 680 … … 584 694 } 585 695 #endif /* LITTLE_ENDIAN_ORDER */ 586 ret = Transform (sha512);696 ret = Transform_Sha512(sha512); 587 697 if (ret != 0) 588 698 return ret; … … 614 724 WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE); 615 725 #endif 616 ret = Transform (sha512);726 ret = Transform_Sha512(sha512); 617 727 if (ret != 0) 618 728 return ret; … … 621 731 ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE); 622 732 #endif 733 734 return 0; 735 } 736 737 #ifdef WOLFSSL_SHA512 738 739 int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash) 740 { 741 #ifdef LITTLE_ENDIAN_ORDER 742 word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)]; 743 #endif 744 745 if (sha512 == NULL || hash == NULL) { 746 return BAD_FUNC_ARG; 747 } 748 749 #ifdef LITTLE_ENDIAN_ORDER 750 ByteReverseWords64((word64*)digest, (word64*)sha512->digest, 751 WC_SHA512_DIGEST_SIZE); 752 XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE); 753 #else 754 XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE); 755 #endif 623 756 624 757 return 0; … … 662 795 return; 663 796 797 #ifdef WOLFSSL_SMALL_STACK_CACHE 798 if (sha512->W != NULL) { 799 XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 800 sha512->W = NULL; 801 } 802 #endif 803 664 804 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512) 665 805 wolfAsync_DevCtxFree(&sha512->asyncDev, WOLFSSL_ASYNC_MARKER_SHA512); … … 670 810 #if defined(HAVE_INTEL_AVX1) 671 811 672 #define Rx_1(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i]; 673 #define Rx_2(i) d(i)+=h(i); 674 #define Rx_3(i) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)); 812 static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f }; 813 814 #define W_0 xmm0 815 #define W_2 xmm1 816 #define W_4 xmm2 817 #define W_6 xmm3 818 #define W_8 xmm4 819 #define W_10 xmm5 820 #define W_12 xmm6 821 #define W_14 xmm7 822 823 #define W_M15 xmm12 824 #define W_M7 xmm13 825 #define MASK xmm14 826 827 #define XTMP1 xmm8 828 #define XTMP2 xmm9 829 #define XTMP3 xmm10 830 #define XTMP4 xmm11 831 832 #define XMM_REGS \ 833 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \ 834 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" 835 836 #define _VPALIGNR(dest, src1, src2, bits) \ 837 "vpalignr $" #bits ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t" 838 
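The assembly helper macros in this block are defined in _NAME/NAME pairs so that arguments such as register aliases (W_M15, XTMP1, ...) and shift counts are macro-expanded before the # stringize operator pastes them into the instruction string. A standalone sketch of this standard two-level stringification idiom follows; the names are hypothetical, not from the wolfSSL sources.

/* Two-level stringification: STR() expands its argument first, STR_() then stringizes it. */
#define STR_(x) #x
#define STR(x)  STR_(x)

#define SHIFT_BITS 7
/* STR_(SHIFT_BITS) yields "SHIFT_BITS"; STR(SHIFT_BITS) yields "7",
 * which is the form the inline-asm templates need. */
static const char shift_str[] = STR(SHIFT_BITS);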
#define VPALIGNR(dest, src1, src2, bits) \ 839 _VPALIGNR(dest, src1, src2, bits) 840 841 #define _V_SHIFT_R(dest, src, bits) \ 842 "vpsrlq $" #bits ", %%" #src ", %%" #dest "\n\t" 843 #define V_SHIFT_R(dest, src, bits) \ 844 _V_SHIFT_R(dest, src, bits) 845 846 #define _V_SHIFT_L(dest, src, bits) \ 847 "vpsllq $" #bits ", %%" #src ", %%" #dest "\n\t" 848 #define V_SHIFT_L(dest, src, bits) \ 849 _V_SHIFT_L(dest, src, bits) 850 851 #define _V_ADD(dest, src1, src2) \ 852 "vpaddq %%" #src1 ", %%" #src2 ", %%" #dest "\n\t" 853 #define V_ADD(dest, src1, src2) \ 854 _V_ADD(dest, src1, src2) 855 856 #define _V_XOR(dest, src1, src2) \ 857 "vpxor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t" 858 #define V_XOR(dest, src1, src2) \ 859 _V_XOR(dest, src1, src2) 860 861 #define _V_OR(dest, src1, src2) \ 862 "vpor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t" 863 #define V_OR(dest, src1, src2) \ 864 _V_OR(dest, src1, src2) 865 866 #define RA %%r8 867 #define RB %%r9 868 #define RC %%r10 869 #define RD %%r11 870 #define RE %%r12 871 #define RF %%r13 872 #define RG %%r14 873 #define RH %%r15 874 875 #define STATE_REGS "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 876 877 #define L1 "%%rax" 878 #define L2 "%%rcx" 879 #define L3 "%%rdx" 880 #define L4 "%%rbx" 881 #define WX "%%rsp" 882 883 #define WORK_REGS "rax", "rbx", "rcx", "rdx" 884 885 #define RND_0_1(a,b,c,d,e,f,g,h,i) \ 886 /* L1 = e >>> 23 */ \ 887 "rorq $23, " L1 "\n\t" \ 888 889 #define RND_0_2(a,b,c,d,e,f,g,h,i) \ 890 /* L3 = a */ \ 891 "movq "#a", " L3 "\n\t" \ 892 /* L2 = f */ \ 893 "movq "#f", " L2 "\n\t" \ 894 /* h += W_X[i] */ \ 895 "addq ("#i")*8(" WX "), "#h"\n\t" \ 896 /* L2 = f ^ g */ \ 897 "xorq "#g", " L2 "\n\t" \ 898 899 #define RND_0_2_A(a,b,c,d,e,f,g,h,i) \ 900 /* L3 = a */ \ 901 "movq "#a", " L3 "\n\t" \ 902 /* L2 = f */ \ 903 "movq "#f", " L2 "\n\t" \ 904 905 #define RND_0_2_B(a,b,c,d,e,f,g,h,i) \ 906 /* h += W_X[i] */ \ 907 "addq ("#i")*8(" WX "), "#h"\n\t" \ 908 /* L2 = f ^ g */ \ 909 "xorq "#g", " L2 "\n\t" \ 910 911 #define RND_0_3(a,b,c,d,e,f,g,h,i) \ 912 /* L1 = (e >>> 23) ^ e */ \ 913 "xorq "#e", " L1 "\n\t" \ 914 /* L2 = (f ^ g) & e */ \ 915 "andq "#e", " L2 "\n\t" \ 916 917 #define RND_0_4(a,b,c,d,e,f,g,h,i) \ 918 /* L1 = ((e >>> 23) ^ e) >>> 4 */ \ 919 "rorq $4, " L1 "\n\t" \ 920 /* L2 = ((f ^ g) & e) ^ g */ \ 921 "xorq "#g", " L2 "\n\t" \ 922 923 #define RND_0_5(a,b,c,d,e,f,g,h,i) \ 924 /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */ \ 925 "xorq "#e", " L1 "\n\t" \ 926 /* h += Ch(e,f,g) */ \ 927 "addq " L2 ", "#h"\n\t" \ 928 929 #define RND_0_6(a,b,c,d,e,f,g,h,i) \ 930 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \ 931 "rorq $14, " L1 "\n\t" \ 932 /* L3 = a ^ b */ \ 933 "xorq "#b", " L3 "\n\t" \ 934 935 #define RND_0_7(a,b,c,d,e,f,g,h,i) \ 936 /* h += Sigma1(e) */ \ 937 "addq " L1 ", "#h"\n\t" \ 938 /* L2 = a */ \ 939 "movq "#a", " L2 "\n\t" \ 940 941 #define RND_0_8(a,b,c,d,e,f,g,h,i) \ 942 /* L4 = (a ^ b) & (b ^ c) */ \ 943 "andq " L3 ", " L4 "\n\t" \ 944 /* L2 = a >>> 5 */ \ 945 "rorq $5, " L2 "\n\t" \ 946 947 #define RND_0_9(a,b,c,d,e,f,g,h,i) \ 948 /* L2 = (a >>> 5) ^ a */ \ 949 "xorq "#a", " L2 "\n\t" \ 950 /* L4 = ((a ^ b) & (b ^ c) ^ b */ \ 951 "xorq "#b", " L4 "\n\t" \ 952 953 #define RND_0_10(a,b,c,d,e,f,g,h,i) \ 954 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \ 955 "rorq $6, " L2 "\n\t" \ 956 /* d += h */ \ 957 "addq "#h", "#d"\n\t" \ 958 959 #define RND_0_11(a,b,c,d,e,f,g,h,i) \ 960 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \ 961 "xorq "#a", " L2 "\n\t" \ 962 /* h += Sigma0(a) */ \ 963 "addq " L4 ", "#h"\n\t" \ 964 965 #define 
RND_0_12(a,b,c,d,e,f,g,h,i) \ 966 /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \ 967 "rorq $28, " L2 "\n\t" \ 968 /* d (= e next RND) */ \ 969 "movq "#d", " L1 "\n\t" \ 970 /* h += Maj(a,b,c) */ \ 971 "addq " L2 ", "#h"\n\t" \ 972 973 #define RND_1_1(a,b,c,d,e,f,g,h,i) \ 974 /* L1 = e >>> 23 */ \ 975 "rorq $23, " L1 "\n\t" \ 976 977 #define RND_1_2(a,b,c,d,e,f,g,h,i) \ 978 /* L4 = a */ \ 979 "movq "#a", " L4 "\n\t" \ 980 /* L2 = f */ \ 981 "movq "#f", " L2 "\n\t" \ 982 /* h += W_X[i] */ \ 983 "addq ("#i")*8(" WX "), "#h"\n\t" \ 984 /* L2 = f ^ g */ \ 985 "xorq "#g", " L2 "\n\t" \ 986 987 #define RND_1_2_A(a,b,c,d,e,f,g,h,i) \ 988 /* L4 = a */ \ 989 "movq "#a", " L4 "\n\t" \ 990 /* L2 = f */ \ 991 "movq "#f", " L2 "\n\t" \ 992 993 #define RND_1_2_B(a,b,c,d,e,f,g,h,i) \ 994 /* h += W_X[i] */ \ 995 "addq ("#i")*8(" WX "), "#h"\n\t" \ 996 /* L2 = f ^ g */ \ 997 "xorq "#g", " L2 "\n\t" \ 998 999 #define RND_1_3(a,b,c,d,e,f,g,h,i) \ 1000 /* L1 = (e >>> 23) ^ e */ \ 1001 "xorq "#e", " L1 "\n\t" \ 1002 /* L2 = (f ^ g) & e */ \ 1003 "andq "#e", " L2 "\n\t" \ 1004 1005 #define RND_1_4(a,b,c,d,e,f,g,h,i) \ 1006 /* ((e >>> 23) ^ e) >>> 4 */ \ 1007 "rorq $4, " L1 "\n\t" \ 1008 /* ((f ^ g) & e) ^ g */ \ 1009 "xorq "#g", " L2 "\n\t" \ 1010 1011 #define RND_1_5(a,b,c,d,e,f,g,h,i) \ 1012 /* (((e >>> 23) ^ e) >>> 4) ^ e */ \ 1013 "xorq "#e", " L1 "\n\t" \ 1014 /* h += Ch(e,f,g) */ \ 1015 "addq " L2 ", "#h"\n\t" \ 1016 1017 #define RND_1_6(a,b,c,d,e,f,g,h,i) \ 1018 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \ 1019 "rorq $14, " L1 "\n\t" \ 1020 /* L4 = a ^ b */ \ 1021 "xorq "#b", " L4 "\n\t" \ 1022 1023 #define RND_1_7(a,b,c,d,e,f,g,h,i) \ 1024 /* h += Sigma1(e) */ \ 1025 "addq " L1 ", "#h"\n\t" \ 1026 /* L2 = a */ \ 1027 "movq "#a", " L2 "\n\t" \ 1028 1029 #define RND_1_8(a,b,c,d,e,f,g,h,i) \ 1030 /* L3 = (a ^ b) & (b ^ c) */ \ 1031 "andq " L4 ", " L3 "\n\t" \ 1032 /* L2 = a >>> 5 */ \ 1033 "rorq $5, " L2 "\n\t" \ 1034 1035 #define RND_1_9(a,b,c,d,e,f,g,h,i) \ 1036 /* L2 = (a >>> 5) ^ a */ \ 1037 "xorq "#a", " L2 "\n\t" \ 1038 /* L3 = ((a ^ b) & (b ^ c) ^ b */ \ 1039 "xorq "#b", " L3 "\n\t" \ 1040 1041 #define RND_1_10(a,b,c,d,e,f,g,h,i) \ 1042 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \ 1043 "rorq $6, " L2 "\n\t" \ 1044 /* d += h */ \ 1045 "addq "#h", "#d"\n\t" \ 1046 1047 #define RND_1_11(a,b,c,d,e,f,g,h,i) \ 1048 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \ 1049 "xorq "#a", " L2 "\n\t" \ 1050 /* h += Sigma0(a) */ \ 1051 "addq " L3 ", "#h"\n\t" \ 1052 1053 #define RND_1_12(a,b,c,d,e,f,g,h,i) \ 1054 /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \ 1055 "rorq $28, " L2 "\n\t" \ 1056 /* d (= e next RND) */ \ 1057 "movq "#d", " L1 "\n\t" \ 1058 /* h += Maj(a,b,c) */ \ 1059 "addq " L2 ", "#h"\n\t" \ 1060 1061 1062 #define MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ 1063 RND_0_1(a,b,c,d,e,f,g,h,i) \ 1064 VPALIGNR(W_M15, W_2, W_0, 8) \ 1065 VPALIGNR(W_M7, W_10, W_8, 8) \ 1066 RND_0_2(a,b,c,d,e,f,g,h,i) \ 1067 V_SHIFT_R(XTMP1, W_M15, 1) \ 1068 V_SHIFT_L(XTMP2, W_M15, 63) \ 1069 RND_0_3(a,b,c,d,e,f,g,h,i) \ 1070 RND_0_4(a,b,c,d,e,f,g,h,i) \ 1071 V_SHIFT_R(XTMP3, W_M15, 8) \ 1072 V_SHIFT_L(XTMP4, W_M15, 56) \ 1073 RND_0_5(a,b,c,d,e,f,g,h,i) \ 1074 RND_0_6(a,b,c,d,e,f,g,h,i) \ 1075 V_OR(XTMP1, XTMP2, XTMP1) \ 1076 V_OR(XTMP3, XTMP4, XTMP3) \ 1077 RND_0_7(a,b,c,d,e,f,g,h,i) \ 1078 RND_0_8(a,b,c,d,e,f,g,h,i) \ 1079 V_SHIFT_R(XTMP4, W_M15, 7) \ 1080 V_XOR(XTMP1, XTMP3, XTMP1) \ 1081 RND_0_9(a,b,c,d,e,f,g,h,i) \ 1082 RND_0_10(a,b,c,d,e,f,g,h,i) \ 1083 V_XOR(XTMP1, XTMP4, XTMP1) \ 1084 V_ADD(W_0, W_0, 
W_M7) \ 1085 RND_0_11(a,b,c,d,e,f,g,h,i) \ 1086 RND_0_12(a,b,c,d,e,f,g,h,i) \ 1087 RND_1_1(h,a,b,c,d,e,f,g,i+1) \ 1088 V_ADD(W_0, W_0, XTMP1) \ 1089 RND_1_2(h,a,b,c,d,e,f,g,i+1) \ 1090 V_SHIFT_R(XTMP1, W_14, 19) \ 1091 V_SHIFT_L(XTMP2, W_14, 45) \ 1092 RND_1_3(h,a,b,c,d,e,f,g,i+1) \ 1093 RND_1_4(h,a,b,c,d,e,f,g,i+1) \ 1094 V_SHIFT_R(XTMP3, W_14, 61) \ 1095 V_SHIFT_L(XTMP4, W_14, 3) \ 1096 RND_1_5(h,a,b,c,d,e,f,g,i+1) \ 1097 RND_1_6(h,a,b,c,d,e,f,g,i+1) \ 1098 RND_1_7(h,a,b,c,d,e,f,g,i+1) \ 1099 V_OR(XTMP1, XTMP2, XTMP1) \ 1100 V_OR(XTMP3, XTMP4, XTMP3) \ 1101 RND_1_8(h,a,b,c,d,e,f,g,i+1) \ 1102 RND_1_9(h,a,b,c,d,e,f,g,i+1) \ 1103 V_XOR(XTMP1, XTMP3, XTMP1) \ 1104 V_SHIFT_R(XTMP4, W_14, 6) \ 1105 RND_1_10(h,a,b,c,d,e,f,g,i+1) \ 1106 RND_1_11(h,a,b,c,d,e,f,g,i+1) \ 1107 V_XOR(XTMP1, XTMP4, XTMP1) \ 1108 RND_1_12(h,a,b,c,d,e,f,g,i+1) \ 1109 V_ADD(W_0, W_0, XTMP1) \ 1110 1111 #define RND_ALL_2(a, b, c, d, e, f, g, h, i) \ 1112 RND_0_1 (a, b, c, d, e, f, g, h, i ) \ 1113 RND_0_2 (a, b, c, d, e, f, g, h, i ) \ 1114 RND_0_3 (a, b, c, d, e, f, g, h, i ) \ 1115 RND_0_4 (a, b, c, d, e, f, g, h, i ) \ 1116 RND_0_5 (a, b, c, d, e, f, g, h, i ) \ 1117 RND_0_6 (a, b, c, d, e, f, g, h, i ) \ 1118 RND_0_7 (a, b, c, d, e, f, g, h, i ) \ 1119 RND_0_8 (a, b, c, d, e, f, g, h, i ) \ 1120 RND_0_9 (a, b, c, d, e, f, g, h, i ) \ 1121 RND_0_10(a, b, c, d, e, f, g, h, i ) \ 1122 RND_0_11(a, b, c, d, e, f, g, h, i ) \ 1123 RND_0_12(a, b, c, d, e, f, g, h, i ) \ 1124 RND_1_1 (h, a, b, c, d, e, f, g, i+1) \ 1125 RND_1_2 (h, a, b, c, d, e, f, g, i+1) \ 1126 RND_1_3 (h, a, b, c, d, e, f, g, i+1) \ 1127 RND_1_4 (h, a, b, c, d, e, f, g, i+1) \ 1128 RND_1_5 (h, a, b, c, d, e, f, g, i+1) \ 1129 RND_1_6 (h, a, b, c, d, e, f, g, i+1) \ 1130 RND_1_7 (h, a, b, c, d, e, f, g, i+1) \ 1131 RND_1_8 (h, a, b, c, d, e, f, g, i+1) \ 1132 RND_1_9 (h, a, b, c, d, e, f, g, i+1) \ 1133 RND_1_10(h, a, b, c, d, e, f, g, i+1) \ 1134 RND_1_11(h, a, b, c, d, e, f, g, i+1) \ 1135 RND_1_12(h, a, b, c, d, e, f, g, i+1) 1136 675 1137 676 1138 #if defined(HAVE_INTEL_RORX) 677 1139 678 #define Rx_RORX_1(i) h(i)+=S1_RORX(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i]; 679 #define Rx_RORX_2(i) d(i)+=h(i); 680 #define Rx_RORX_3(i) h(i)+=S0_RORX(a(i))+Maj(a(i),b(i),c(i)); 681 #endif /* HAVE_INTEL_RORX */ 1140 #define RND_RORX_0_1(a, b, c, d, e, f, g, h, i) \ 1141 /* L1 = e>>>14 */ \ 1142 "rorxq $14, "#e", " L1 "\n\t" \ 1143 /* L2 = e>>>18 */ \ 1144 "rorxq $18, "#e", " L2 "\n\t" \ 1145 /* Prev RND: h += Maj(a,b,c) */ \ 1146 "addq " L3 ", "#a"\n\t" \ 1147 1148 #define RND_RORX_0_2(a, b, c, d, e, f, g, h, i) \ 1149 /* h += w_k */ \ 1150 "addq ("#i")*8(" WX "), "#h"\n\t" \ 1151 /* L3 = f */ \ 1152 "movq "#f", " L3 "\n\t" \ 1153 /* L2 = (e>>>14) ^ (e>>>18) */ \ 1154 "xorq " L1 ", " L2 "\n\t" \ 1155 1156 #define RND_RORX_0_3(a, b, c, d, e, f, g, h, i) \ 1157 /* L3 = f ^ g */ \ 1158 "xorq "#g", " L3 "\n\t" \ 1159 /* L1 = e>>>41 */ \ 1160 "rorxq $41, "#e", " L1 "\n\t" \ 1161 /* L1 = Sigma1(e) */ \ 1162 "xorq " L2 ", " L1 "\n\t" \ 1163 1164 #define RND_RORX_0_4(a, b, c, d, e, f, g, h, i) \ 1165 /* L3 = (f ^ g) & e */ \ 1166 "andq "#e", " L3 "\n\t" \ 1167 /* h += Sigma1(e) */ \ 1168 "addq " L1 ", "#h"\n\t" \ 1169 /* L1 = a>>>28 */ \ 1170 "rorxq $28, "#a", " L1 "\n\t" \ 1171 1172 #define RND_RORX_0_5(a, b, c, d, e, f, g, h, i) \ 1173 /* L2 = a>>>34 */ \ 1174 "rorxq $34, "#a", " L2 "\n\t" \ 1175 /* L3 = Ch(e,f,g) */ \ 1176 "xorq "#g", " L3 "\n\t" \ 1177 /* L2 = (a>>>28) ^ (a>>>34) */ \ 1178 "xorq " L1 ", " L2 "\n\t" \ 1179 1180 #define RND_RORX_0_6(a, b, c, d, e, f, g, 
h, i) \ 1181 /* L1 = a>>>39 */ \ 1182 "rorxq $39, "#a", " L1 "\n\t" \ 1183 /* h += Ch(e,f,g) */ \ 1184 "addq " L3 ", "#h"\n\t" \ 1185 /* L1 = Sigma0(a) */ \ 1186 "xorq " L2 ", " L1 "\n\t" \ 1187 1188 #define RND_RORX_0_7(a, b, c, d, e, f, g, h, i) \ 1189 /* L3 = b */ \ 1190 "movq "#b", " L3 "\n\t" \ 1191 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 1192 "addq "#h", "#d"\n\t" \ 1193 /* L3 = a ^ b */ \ 1194 "xorq "#a", " L3 "\n\t" \ 1195 1196 #define RND_RORX_0_8(a, b, c, d, e, f, g, h, i) \ 1197 /* L4 = (a ^ b) & (b ^ c) */ \ 1198 "andq " L3 ", " L4 "\n\t" \ 1199 /* h += Sigma0(a) */ \ 1200 "addq " L1 ", "#h"\n\t" \ 1201 /* L4 = Maj(a,b,c) */ \ 1202 "xorq "#b", " L4 "\n\t" \ 1203 1204 #define RND_RORX_1_1(a, b, c, d, e, f, g, h, i) \ 1205 /* L1 = e>>>14 */ \ 1206 "rorxq $14, "#e", " L1 "\n\t" \ 1207 /* L2 = e>>>18 */ \ 1208 "rorxq $18, "#e", " L2 "\n\t" \ 1209 /* Prev RND: h += Maj(a,b,c) */ \ 1210 "addq " L4 ", "#a"\n\t" \ 1211 1212 #define RND_RORX_1_2(a, b, c, d, e, f, g, h, i) \ 1213 /* h += w_k */ \ 1214 "addq ("#i")*8(" WX "), "#h"\n\t" \ 1215 /* L4 = f */ \ 1216 "movq "#f", " L4 "\n\t" \ 1217 /* L2 = (e>>>14) ^ (e>>>18) */ \ 1218 "xorq " L1 ", " L2 "\n\t" \ 1219 1220 #define RND_RORX_1_3(a, b, c, d, e, f, g, h, i) \ 1221 /* L4 = f ^ g */ \ 1222 "xorq "#g", " L4 "\n\t" \ 1223 /* L1 = e>>>41 */ \ 1224 "rorxq $41, "#e", " L1 "\n\t" \ 1225 /* L1 = Sigma1(e) */ \ 1226 "xorq " L2 ", " L1 "\n\t" \ 1227 1228 #define RND_RORX_1_4(a, b, c, d, e, f, g, h, i) \ 1229 /* L4 = (f ^ g) & e */ \ 1230 "andq "#e", " L4 "\n\t" \ 1231 /* h += Sigma1(e) */ \ 1232 "addq " L1 ", "#h"\n\t" \ 1233 /* L1 = a>>>28 */ \ 1234 "rorxq $28, "#a", " L1 "\n\t" \ 1235 1236 #define RND_RORX_1_5(a, b, c, d, e, f, g, h, i) \ 1237 /* L2 = a>>>34 */ \ 1238 "rorxq $34, "#a", " L2 "\n\t" \ 1239 /* L4 = Ch(e,f,g) */ \ 1240 "xorq "#g", " L4 "\n\t" \ 1241 /* L2 = (a>>>28) ^ (a>>>34) */ \ 1242 "xorq " L1 ", " L2 "\n\t" \ 1243 1244 #define RND_RORX_1_6(a, b, c, d, e, f, g, h, i) \ 1245 /* L1 = a>>>39 */ \ 1246 "rorxq $39, "#a", " L1 "\n\t" \ 1247 /* h += Ch(e,f,g) */ \ 1248 "addq " L4 ", "#h"\n\t" \ 1249 /* L1 = Sigma0(a) */ \ 1250 "xorq " L2 ", " L1 "\n\t" \ 1251 1252 #define RND_RORX_1_7(a, b, c, d, e, f, g, h, i) \ 1253 /* L4 = b */ \ 1254 "movq "#b", " L4 "\n\t" \ 1255 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 1256 "addq "#h", "#d"\n\t" \ 1257 /* L4 = a ^ b */ \ 1258 "xorq "#a", " L4 "\n\t" \ 1259 1260 #define RND_RORX_1_8(a, b, c, d, e, f, g, h, i) \ 1261 /* L2 = (a ^ b) & (b ^ c) */ \ 1262 "andq " L4 ", " L3 "\n\t" \ 1263 /* h += Sigma0(a) */ \ 1264 "addq " L1 ", "#h"\n\t" \ 1265 /* L3 = Maj(a,b,c) */ \ 1266 "xorq "#b", " L3 "\n\t" \ 1267 1268 #define RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i) \ 1269 RND_RORX_0_1(a, b, c, d, e, f, g, h, i+0) \ 1270 RND_RORX_0_2(a, b, c, d, e, f, g, h, i+0) \ 1271 RND_RORX_0_3(a, b, c, d, e, f, g, h, i+0) \ 1272 RND_RORX_0_4(a, b, c, d, e, f, g, h, i+0) \ 1273 RND_RORX_0_5(a, b, c, d, e, f, g, h, i+0) \ 1274 RND_RORX_0_6(a, b, c, d, e, f, g, h, i+0) \ 1275 RND_RORX_0_7(a, b, c, d, e, f, g, h, i+0) \ 1276 RND_RORX_0_8(a, b, c, d, e, f, g, h, i+0) \ 1277 RND_RORX_1_1(h, a, b, c, d, e, f, g, i+1) \ 1278 RND_RORX_1_2(h, a, b, c, d, e, f, g, i+1) \ 1279 RND_RORX_1_3(h, a, b, c, d, e, f, g, i+1) \ 1280 RND_RORX_1_4(h, a, b, c, d, e, f, g, i+1) \ 1281 RND_RORX_1_5(h, a, b, c, d, e, f, g, i+1) \ 1282 RND_RORX_1_6(h, a, b, c, d, e, f, g, i+1) \ 1283 RND_RORX_1_7(h, a, b, c, d, e, f, g, i+1) \ 1284 RND_RORX_1_8(h, a, b, c, d, e, f, g, i+1) \ 1285 1286 #define RND_RORX_ALL_4(a, b, c, d, e, f, g, h, i) 
\ 1287 RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i+0) \ 1288 RND_RORX_ALL_2(g, h, a, b, c, d, e, f, i+2) 1289 1290 #define MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ 1291 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ 1292 VPALIGNR(W_M15, W_2, W_0, 8) \ 1293 VPALIGNR(W_M7, W_10, W_8, 8) \ 1294 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ 1295 V_SHIFT_R(XTMP1, W_M15, 1) \ 1296 V_SHIFT_L(XTMP2, W_M15, 63) \ 1297 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ 1298 V_SHIFT_R(XTMP3, W_M15, 8) \ 1299 V_SHIFT_L(XTMP4, W_M15, 56) \ 1300 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ 1301 V_OR(XTMP1, XTMP2, XTMP1) \ 1302 V_OR(XTMP3, XTMP4, XTMP3) \ 1303 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \ 1304 V_SHIFT_R(XTMP4, W_M15, 7) \ 1305 V_XOR(XTMP1, XTMP3, XTMP1) \ 1306 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \ 1307 V_XOR(XTMP1, XTMP4, XTMP1) \ 1308 V_ADD(W_0, W_0, W_M7) \ 1309 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \ 1310 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \ 1311 V_ADD(W_0, W_0, XTMP1) \ 1312 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \ 1313 V_SHIFT_R(XTMP1, W_14, 19) \ 1314 V_SHIFT_L(XTMP2, W_14, 45) \ 1315 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \ 1316 V_SHIFT_R(XTMP3, W_14, 61) \ 1317 V_SHIFT_L(XTMP4, W_14, 3) \ 1318 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \ 1319 V_OR(XTMP1, XTMP2, XTMP1) \ 1320 V_OR(XTMP3, XTMP4, XTMP3) \ 1321 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \ 1322 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \ 1323 V_XOR(XTMP1, XTMP3, XTMP1) \ 1324 V_SHIFT_R(XTMP4, W_14, 6) \ 1325 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \ 1326 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \ 1327 V_XOR(XTMP1, XTMP4, XTMP1) \ 1328 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \ 1329 V_ADD(W_0, W_0, XTMP1) \ 1330 1331 #endif 1332 1333 #define _INIT_MASK(mask) \ 1334 "vmovdqu %[mask], %%" #mask "\n\t" 1335 #define INIT_MASK(mask) \ 1336 _INIT_MASK(mask) 1337 1338 #define _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \ 1339 "vmovdqu " #i1 "*16(%%" #reg "), %%" #xmm1 "\n\t" \ 1340 "vmovdqu " #i2 "*16(%%" #reg "), %%" #xmm2 "\n\t" \ 1341 "vpshufb %%" #mask ", %%" #xmm1 ", %%" #xmm1 "\n\t" \ 1342 "vpshufb %%" #mask ", %%" #xmm2 ", %%" #xmm2 "\n\t" 1343 #define LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \ 1344 _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) 1345 1346 #define LOAD_W(mask, reg) \ 1347 /* X0..3(xmm4..7), W[0..15] = buffer[0.15]; */ \ 1348 LOAD_W_2(0, 1, W_0 , W_2 , mask, reg) \ 1349 LOAD_W_2(2, 3, W_4 , W_6 , mask, reg) \ 1350 LOAD_W_2(4, 5, W_8 , W_10, mask, reg) \ 1351 LOAD_W_2(6, 7, W_12, W_14, mask, reg) 1352 1353 #define _SET_W_X_2(xmm0, xmm1, reg, i) \ 1354 "vpaddq " #i "+ 0(%%" #reg "), %%" #xmm0 ", %%xmm8\n\t" \ 1355 "vpaddq " #i "+16(%%" #reg "), %%" #xmm1 ", %%xmm9\n\t" \ 1356 "vmovdqu %%xmm8, " #i "+ 0(" WX ")\n\t" \ 1357 "vmovdqu %%xmm9, " #i "+16(" WX ")\n\t" \ 1358 1359 #define SET_W_X_2(xmm0, xmm1, reg, i) \ 1360 _SET_W_X_2(xmm0, xmm1, reg, i) 1361 1362 #define SET_W_X(reg) \ 1363 SET_W_X_2(W_0 , W_2 , reg, 0) \ 1364 SET_W_X_2(W_4 , W_6 , reg, 32) \ 1365 SET_W_X_2(W_8 , W_10, reg, 64) \ 1366 SET_W_X_2(W_12, W_14, reg, 96) 1367 1368 #define LOAD_DIGEST() \ 1369 "movq (%[sha512]), %%r8 \n\t" \ 1370 "movq 8(%[sha512]), %%r9 \n\t" \ 1371 "movq 16(%[sha512]), %%r10\n\t" \ 1372 "movq 24(%[sha512]), %%r11\n\t" \ 1373 "movq 32(%[sha512]), %%r12\n\t" \ 1374 "movq 40(%[sha512]), %%r13\n\t" \ 1375 "movq 48(%[sha512]), %%r14\n\t" \ 1376 "movq 56(%[sha512]), %%r15\n\t" 1377 1378 #define STORE_ADD_DIGEST() \ 1379 "addq %%r8, (%[sha512])\n\t" \ 1380 "addq %%r9, 8(%[sha512])\n\t" \ 1381 "addq %%r10, 16(%[sha512])\n\t" \ 1382 "addq %%r11, 24(%[sha512])\n\t" \ 1383 "addq %%r12, 32(%[sha512])\n\t" \ 1384 "addq %%r13, 40(%[sha512])\n\t" 
\ 1385 "addq %%r14, 48(%[sha512])\n\t" \ 1386 "addq %%r15, 56(%[sha512])\n\t" 1387 1388 #define ADD_DIGEST() \ 1389 "addq (%[sha512]), %%r8 \n\t" \ 1390 "addq 8(%[sha512]), %%r9 \n\t" \ 1391 "addq 16(%[sha512]), %%r10\n\t" \ 1392 "addq 24(%[sha512]), %%r11\n\t" \ 1393 "addq 32(%[sha512]), %%r12\n\t" \ 1394 "addq 40(%[sha512]), %%r13\n\t" \ 1395 "addq 48(%[sha512]), %%r14\n\t" \ 1396 "addq 56(%[sha512]), %%r15\n\t" 1397 1398 #define STORE_DIGEST() \ 1399 "movq %%r8, (%[sha512])\n\t" \ 1400 "movq %%r9, 8(%[sha512])\n\t" \ 1401 "movq %%r10, 16(%[sha512])\n\t" \ 1402 "movq %%r11, 24(%[sha512])\n\t" \ 1403 "movq %%r12, 32(%[sha512])\n\t" \ 1404 "movq %%r13, 40(%[sha512])\n\t" \ 1405 "movq %%r14, 48(%[sha512])\n\t" \ 1406 "movq %%r15, 56(%[sha512])\n\t" 682 1407 683 1408 #endif /* HAVE_INTEL_AVX1 */ 684 685 #if defined(HAVE_INTEL_AVX2)686 #define Ry_1(i, w) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + w;687 #define Ry_2(i, w) d(i)+=h(i);688 #define Ry_3(i, w) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i));689 #endif /* HAVE_INTEL_AVX2 */690 691 /* INLINE Assember for Intel AVX1 instructions */692 #if defined(HAVE_INTEL_AVX1)693 #if defined(DEBUG_XMM)694 #define SAVE_REG(i) __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0]):);695 #define RECV_REG(i) __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0]));696 697 #define _DUMP_REG(REG, name)\698 { word64 buf[16];word64 reg[16][2];int k;\699 SAVE_REG(0); SAVE_REG(1); SAVE_REG(2); SAVE_REG(3); SAVE_REG(4); \700 SAVE_REG(5); SAVE_REG(6); SAVE_REG(7);SAVE_REG(8); SAVE_REG(9); SAVE_REG(10);\701 SAVE_REG(11); SAVE_REG(12); SAVE_REG(13); SAVE_REG(14); SAVE_REG(15); \702 __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0]):);\703 printf(" "#name":\t"); for(k=0; k<2; k++) printf("%016lx.", (word64)(buf[k])); printf("\n"); \704 RECV_REG(0); RECV_REG(1); RECV_REG(2); RECV_REG(3); RECV_REG(4);\705 RECV_REG(5); RECV_REG(6); RECV_REG(7); RECV_REG(8); RECV_REG(9);\706 RECV_REG(10); RECV_REG(11); RECV_REG(12); RECV_REG(13); RECV_REG(14); RECV_REG(15);\707 }708 709 #define DUMP_REG(REG) _DUMP_REG(REG, #REG)710 #define PRINTF(fmt, ...)711 #else712 #define DUMP_REG(REG)713 #define PRINTF(fmt, ...)714 #endif /* DEBUG_XMM */715 716 #define _MOVE_to_REG(xymm, mem) __asm__ volatile("vmovdqu %0, %%"#xymm" "\717 :: "m"(mem));718 #define _MOVE_to_MEM(mem,i, xymm) __asm__ volatile("vmovdqu %%"#xymm", %0" :\719 "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3]):);720 #define _MOVE(dest, src) __asm__ volatile("vmovdqu %%"#src", %%"\721 #dest" "::);722 723 #define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrlq $"#bits", %%"\724 #src", %%"#dest"\n\tvpsllq $64-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\725 #temp",%%"#dest", %%"#dest" "::);726 #define _AVX1_R(dest, src, bits) __asm__ volatile("vpsrlq $"#bits", %%"\727 #src", %%"#dest" "::);728 #define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\729 #src2", %%"#dest" "::);730 #define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\731 #src2", %%"#dest" "::);732 #define _ADD(dest, src1, src2) __asm__ volatile("vpaddq %%"#src1", %%"\733 #src2", %%"#dest" "::);734 #define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddq %0, %%"#src1", %%"\735 #dest" "::"m"(mem));736 737 #define MOVE_to_REG(xymm, mem) _MOVE_to_REG(xymm, mem)738 #define MOVE_to_MEM(mem, i, xymm) _MOVE_to_MEM(mem, i, xymm)739 #define MOVE(dest, src) _MOVE(dest, src)740 741 #define XOR(dest, src1, src2) _XOR(dest, src1, src2)742 #define OR(dest, src1, src2) _OR(dest, src1, src2)743 #define ADD(dest, src1, src2) 
_ADD(dest, src1, src2)744 745 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);746 #define AVX1_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP)747 #define AVX1_R(dest, src, bits) _AVX1_R(dest, src, bits)748 749 #define Init_Mask(mask) \750 __asm__ volatile("vmovdqu %0, %%xmm1\n\t"::"m"(mask):"%xmm1");751 752 #define _W_from_buff1(w, buff, xmm) \753 /* X0..3(xmm4..7), W[0..15] = sha512->buffer[0.15]; */\754 __asm__ volatile("vmovdqu %1, %%"#xmm"\n\t"\755 "vpshufb %%xmm1, %%"#xmm", %%"#xmm"\n\t"\756 "vmovdqu %%"#xmm", %0"\757 :"=m"(w): "m"(buff):"%xmm0");758 759 #define W_from_buff1(w, buff, xmm) _W_from_buff1(w, buff, xmm)760 761 #define W_from_buff(w, buff)\762 Init_Mask(mBYTE_FLIP_MASK[0]);\763 W_from_buff1(w[0], buff[0], W_0);\764 W_from_buff1(w[2], buff[2], W_2);\765 W_from_buff1(w[4], buff[4], W_4);\766 W_from_buff1(w[6], buff[6], W_6);\767 W_from_buff1(w[8], buff[8], W_8);\768 W_from_buff1(w[10],buff[10],W_10);\769 W_from_buff1(w[12],buff[12],W_12);\770 W_from_buff1(w[14],buff[14],W_14);771 772 static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f };773 774 #define W_I_15 xmm14775 #define W_I_7 xmm11776 #define W_I_2 xmm13777 #define W_I xmm12778 #define G_TEMP xmm0779 #define S_TEMP xmm1780 #define XMM_TEMP0 xmm2781 782 #define W_0 xmm12783 #define W_2 xmm3784 #define W_4 xmm4785 #define W_6 xmm5786 #define W_8 xmm6787 #define W_10 xmm7788 #define W_12 xmm8789 #define W_14 xmm9790 791 #define s0_1(dest, src) AVX1_S(dest, src, 1);792 #define s0_2(dest, src) AVX1_S(G_TEMP, src, 8); XOR(dest, G_TEMP, dest);793 #define s0_3(dest, src) AVX1_R(G_TEMP, src, 7); XOR(dest, G_TEMP, dest);794 795 #define s1_1(dest, src) AVX1_S(dest, src, 19);796 #define s1_2(dest, src) AVX1_S(G_TEMP, src, 61); XOR(dest, G_TEMP, dest);797 #define s1_3(dest, src) AVX1_R(G_TEMP, src, 6); XOR(dest, G_TEMP, dest);798 799 #define s0_(dest, src) s0_1(dest, src); s0_2(dest, src); s0_3(dest, src)800 #define s1_(dest, src) s1_1(dest, src); s1_2(dest, src); s1_3(dest, src)801 802 #define Block_xx_1(i) \803 MOVE_to_REG(W_I_15, W_X[(i-15)&15]);\804 MOVE_to_REG(W_I_7, W_X[(i- 7)&15]);\805 806 #define Block_xx_2(i) \807 MOVE_to_REG(W_I_2, W_X[(i- 2)&15]);\808 MOVE_to_REG(W_I, W_X[(i)]);\809 810 #define Block_xx_3(i) \811 s0_ (XMM_TEMP0, W_I_15);\812 813 #define Block_xx_4(i) \814 ADD(W_I, W_I, XMM_TEMP0);\815 ADD(W_I, W_I, W_I_7);\816 817 #define Block_xx_5(i) \818 s1_ (XMM_TEMP0, W_I_2);\819 820 #define Block_xx_6(i) \821 ADD(W_I, W_I, XMM_TEMP0);\822 MOVE_to_MEM(W_X,i, W_I);\823 if (i==0)\824 MOVE_to_MEM(W_X,16, W_I);\825 826 #define Block_xx_7(i) \827 MOVE_to_REG(W_I_15, W_X[(i-15)&15]);\828 MOVE_to_REG(W_I_7, W_X[(i- 7)&15]);\829 830 #define Block_xx_8(i) \831 MOVE_to_REG(W_I_2, W_X[(i- 2)&15]);\832 MOVE_to_REG(W_I, W_X[(i)]);\833 834 #define Block_xx_9(i) \835 s0_ (XMM_TEMP0, W_I_15);\836 837 #define Block_xx_10(i) \838 ADD(W_I, W_I, XMM_TEMP0);\839 ADD(W_I, W_I, W_I_7);\840 841 #define Block_xx_11(i) \842 s1_ (XMM_TEMP0, W_I_2);\843 844 #define Block_xx_12(i) \845 ADD(W_I, W_I, XMM_TEMP0);\846 MOVE_to_MEM(W_X,i, W_I);\847 if ((i)==0)\848 MOVE_to_MEM(W_X,16, W_I);\849 850 static INLINE void Block_0_1(word64 *W_X) { Block_xx_1(0); }851 static INLINE void Block_0_2(word64 *W_X) { Block_xx_2(0); }852 static INLINE void Block_0_3(void) { Block_xx_3(0); }853 static INLINE void Block_0_4(void) { Block_xx_4(0); }854 static INLINE void Block_0_5(void) { Block_xx_5(0); }855 static INLINE void Block_0_6(word64 *W_X) { Block_xx_6(0); }856 static INLINE void Block_0_7(word64 *W_X) { 
Block_xx_7(2); }857 static INLINE void Block_0_8(word64 *W_X) { Block_xx_8(2); }858 static INLINE void Block_0_9(void) { Block_xx_9(2); }859 static INLINE void Block_0_10(void){ Block_xx_10(2); }860 static INLINE void Block_0_11(void){ Block_xx_11(2); }861 static INLINE void Block_0_12(word64 *W_X){ Block_xx_12(2); }862 863 static INLINE void Block_4_1(word64 *W_X) { Block_xx_1(4); }864 static INLINE void Block_4_2(word64 *W_X) { Block_xx_2(4); }865 static INLINE void Block_4_3(void) { Block_xx_3(4); }866 static INLINE void Block_4_4(void) { Block_xx_4(4); }867 static INLINE void Block_4_5(void) { Block_xx_5(4); }868 static INLINE void Block_4_6(word64 *W_X) { Block_xx_6(4); }869 static INLINE void Block_4_7(word64 *W_X) { Block_xx_7(6); }870 static INLINE void Block_4_8(word64 *W_X) { Block_xx_8(6); }871 static INLINE void Block_4_9(void) { Block_xx_9(6); }872 static INLINE void Block_4_10(void){ Block_xx_10(6); }873 static INLINE void Block_4_11(void){ Block_xx_11(6); }874 static INLINE void Block_4_12(word64 *W_X){ Block_xx_12(6); }875 876 static INLINE void Block_8_1(word64 *W_X) { Block_xx_1(8); }877 static INLINE void Block_8_2(word64 *W_X) { Block_xx_2(8); }878 static INLINE void Block_8_3(void) { Block_xx_3(8); }879 static INLINE void Block_8_4(void) { Block_xx_4(8); }880 static INLINE void Block_8_5(void) { Block_xx_5(8); }881 static INLINE void Block_8_6(word64 *W_X) { Block_xx_6(8); }882 static INLINE void Block_8_7(word64 *W_X) { Block_xx_7(10); }883 static INLINE void Block_8_8(word64 *W_X) { Block_xx_8(10); }884 static INLINE void Block_8_9(void) { Block_xx_9(10); }885 static INLINE void Block_8_10(void){ Block_xx_10(10); }886 static INLINE void Block_8_11(void){ Block_xx_11(10); }887 static INLINE void Block_8_12(word64 *W_X){ Block_xx_12(10); }888 889 static INLINE void Block_12_1(word64 *W_X) { Block_xx_1(12); }890 static INLINE void Block_12_2(word64 *W_X) { Block_xx_2(12); }891 static INLINE void Block_12_3(void) { Block_xx_3(12); }892 static INLINE void Block_12_4(void) { Block_xx_4(12); }893 static INLINE void Block_12_5(void) { Block_xx_5(12); }894 static INLINE void Block_12_6(word64 *W_X) { Block_xx_6(12); }895 static INLINE void Block_12_7(word64 *W_X) { Block_xx_7(14); }896 static INLINE void Block_12_8(word64 *W_X) { Block_xx_8(14); }897 static INLINE void Block_12_9(void) { Block_xx_9(14); }898 static INLINE void Block_12_10(void){ Block_xx_10(14); }899 static INLINE void Block_12_11(void){ Block_xx_11(14); }900 static INLINE void Block_12_12(word64 *W_X){ Block_xx_12(14); }901 902 #endif /* HAVE_INTEL_AVX1 */903 904 #if defined(HAVE_INTEL_AVX2)905 static const unsigned long mBYTE_FLIP_MASK_Y[] =906 { 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f };907 908 #define W_from_buff_Y(buff)\909 { /* X0..3(ymm9..12), W_X[0..15] = sha512->buffer[0.15]; */\910 __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]));\911 __asm__ volatile("vmovdqu %0, %%ymm12\n\t"\912 "vmovdqu %1, %%ymm4\n\t"\913 "vpshufb %%ymm8, %%ymm12, %%ymm12\n\t"\914 "vpshufb %%ymm8, %%ymm4, %%ymm4\n\t"\915 :: "m"(buff[0]), "m"(buff[4]));\916 __asm__ volatile("vmovdqu %0, %%ymm5\n\t"\917 "vmovdqu %1, %%ymm6\n\t"\918 "vpshufb %%ymm8, %%ymm5, %%ymm5\n\t"\919 "vpshufb %%ymm8, %%ymm6, %%ymm6\n\t"\920 :: "m"(buff[8]), "m"(buff[12]));\921 }922 923 #if defined(DEBUG_YMM)924 #define SAVE_REG_Y(i) __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0]):);925 #define RECV_REG_Y(i) __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0]));926 927 
#define _DUMP_REG_Y(REG, name)\928 { word64 buf[16];word64 reg[16][2];int k;\929 SAVE_REG_Y(4); SAVE_REG_Y(5); SAVE_REG_Y(6); SAVE_REG_Y(7); \930 SAVE_REG_Y(8); SAVE_REG_Y(9); SAVE_REG_Y(10); SAVE_REG_Y(11); SAVE_REG_Y(12);\931 SAVE_REG_Y(13); SAVE_REG_Y(14); SAVE_REG_Y(15); \932 __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0]):);\933 printf(" "#name":\t"); for(k=0; k<4; k++) printf("%016lx.", (word64)buf[k]); printf("\n"); \934 RECV_REG_Y(4); RECV_REG_Y(5); RECV_REG_Y(6); RECV_REG_Y(7); \935 RECV_REG_Y(8); RECV_REG_Y(9); RECV_REG_Y(10); RECV_REG_Y(11); RECV_REG_Y(12); \936 RECV_REG_Y(13); RECV_REG_Y(14); RECV_REG_Y(15);\937 }938 939 #define DUMP_REG_Y(REG) _DUMP_REG_Y(REG, #REG)940 #define DUMP_REG2_Y(REG) _DUMP_REG_Y(REG, #REG)941 #define PRINTF_Y(fmt, ...)942 #else943 #define DUMP_REG_Y(REG)944 #define DUMP_REG2_Y(REG)945 #define PRINTF_Y(fmt, ...)946 #endif /* DEBUG_YMM */947 948 #define _MOVE_to_REGy(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" "\949 :: "m"(mem));950 #define _MOVE_to_MEMy(mem,i, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" \951 : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3]):);952 #define _MOVE_128y(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"\953 #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" "::);954 #define _S_TEMPy(dest, src, bits, temp) \955 __asm__ volatile("vpsrlq $"#bits", %%"#src", %%"#dest"\n\tvpsllq $64-"#bits\956 ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" "::);957 #define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrlq $"#bits", %%"\958 #src", %%"#dest" "::);959 #define _XORy(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\960 #src2", %%"#dest" "::);961 #define _ADDy(dest, src1, src2) __asm__ volatile("vpaddq %%"#src1", %%"\962 #src2", %%"#dest" "::);963 #define _BLENDy(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\964 #src1", %%"#src2", %%"#dest" "::);965 #define _BLENDQy(map, dest, src1, src2) __asm__ volatile("vblendpd $"#map", %%"\966 #src1", %%"#src2", %%"#dest" "::);967 #define _PERMQy(map, dest, src) __asm__ volatile("vpermq $"#map", %%"\968 #src", %%"#dest" "::);969 970 #define MOVE_to_REGy(ymm, mem) _MOVE_to_REGy(ymm, mem)971 #define MOVE_to_MEMy(mem, i, ymm) _MOVE_to_MEMy(mem, i, ymm)972 973 #define MOVE_128y(ymm0, ymm1, ymm2, map) _MOVE_128y(ymm0, ymm1, ymm2, map)974 #define XORy(dest, src1, src2) _XORy(dest, src1, src2)975 #define ADDy(dest, src1, src2) _ADDy(dest, src1, src2)976 #define BLENDy(map, dest, src1, src2) _BLENDy(map, dest, src1, src2)977 #define BLENDQy(map, dest, src1, src2) _BLENDQy(map, dest, src1, src2)978 #define PERMQy(map, dest, src) _PERMQy(map, dest, src)979 980 981 #define S_TMPy(dest, src, bits, temp) _S_TEMPy(dest, src, bits, temp);982 #define AVX2_S(dest, src, bits) S_TMPy(dest, src, bits, S_TEMPy)983 #define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits)984 985 986 #define FEEDBACK1_to_W_I_2(w_i_2, w_i) MOVE_128y(YMM_TEMP0, w_i, w_i, 0x08);\987 BLENDy(0xf0, w_i_2, YMM_TEMP0, w_i_2);988 989 #define MOVE_W_to_W_I_15(w_i_15, w_0, w_4) BLENDQy(0x1, w_i_15, w_4, w_0);\990 PERMQy(0x39, w_i_15, w_i_15);991 #define MOVE_W_to_W_I_7(w_i_7, w_8, w_12) BLENDQy(0x1, w_i_7, w_12, w_8);\992 PERMQy(0x39, w_i_7, w_i_7);993 #define MOVE_W_to_W_I_2(w_i_2, w_12) BLENDQy(0xc, w_i_2, w_12, w_i_2);\994 PERMQy(0x0e, w_i_2, w_i_2);995 996 997 #define W_I_16y ymm8998 #define W_I_15y ymm9999 #define W_I_7y ymm101000 #define W_I_2y ymm111001 #define W_Iy ymm121002 #define G_TEMPy ymm131003 #define S_TEMPy ymm141004 #define YMM_TEMP0 ymm151005 #define YMM_TEMP0x xmm151006 
#define W_I_TEMPy ymm71007 #define W_K_TEMPy ymm151008 #define W_K_TEMPx xmm151009 #define W_0y ymm121010 #define W_4y ymm41011 #define W_8y ymm51012 #define W_12y ymm61013 1014 1015 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\1016 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" "::);\1017 __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" "::);\1018 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" "::);\1019 __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" "::);\1020 __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" "::);\1021 1022 #define MOVE_7_to_15(w_i_15, w_i_7)\1023 __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" "::);\1024 1025 #define MOVE_I_to_7(w_i_7, w_i)\1026 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" "::);\1027 __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" "::);\1028 __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" "::);\1029 1030 #define MOVE_I_to_2(w_i_2, w_i)\1031 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" "::);\1032 __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" "::);\1033 1034 #endif /* HAVE_INTEL_AVX2 */1035 1409 1036 1410 1037 1411 /*** Transform Body ***/ 1038 1412 #if defined(HAVE_INTEL_AVX1) 1039 static int Transform_ AVX1(wc_Sha512* sha512)1413 static int Transform_Sha512_AVX1(wc_Sha512* sha512) 1040 1414 { 1041 const word64* K = K512; 1042 word64 W_X[16+4] = {0}; 1043 word32 j; 1044 word64 T[8]; 1045 1046 /* Copy digest to working vars */ 1047 XMEMCPY(T, sha512->digest, sizeof(T)); 1048 1049 W_from_buff(W_X, sha512->buffer); 1050 for (j = 0; j < 80; j += 16) { 1051 Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3(); 1052 Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X); 1053 Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9(); 1054 Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X); 1055 1056 Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3(); 1057 Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X); 1058 Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9(); 1059 Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X); 1060 1061 Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3(); 1062 Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X); 1063 Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9(); 1064 Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X); 1065 1066 Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3(); 1067 Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X); 1068 Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9(); 1069 Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X); 1070 } 1071 1072 /* Add the working vars back into digest */ 1073 sha512->digest[0] += a(0); 1074 sha512->digest[1] += b(0); 1075 sha512->digest[2] += c(0); 1076 sha512->digest[3] += d(0); 1077 sha512->digest[4] += e(0); 1078 sha512->digest[5] += f(0); 1079 sha512->digest[6] += g(0); 1080 sha512->digest[7] += h(0); 1081 1082 /* Wipe variables */ 1083 #if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) 1084 XMEMSET(W_X, 0, sizeof(word64) * 16); 1085 #endif 1086 XMEMSET(T, 0, sizeof(T)); 1415 __asm__ __volatile__ ( 1416 1417 /* 16 Ws plus loop counter. 
*/ 1418 "subq $136, %%rsp\n\t" 1419 "leaq 64(%[sha512]), %%rax\n\t" 1420 1421 INIT_MASK(MASK) 1422 LOAD_DIGEST() 1423 1424 LOAD_W(MASK, rax) 1425 1426 "movl $4, 16*8(" WX ")\n\t" 1427 "leaq %[K512], %%rsi\n\t" 1428 /* b */ 1429 "movq %%r9, " L4 "\n\t" 1430 /* e */ 1431 "movq %%r12, " L1 "\n\t" 1432 /* b ^ c */ 1433 "xorq %%r10, " L4 "\n\t" 1434 1435 "# Start of 16 rounds\n" 1436 "1:\n\t" 1437 1438 SET_W_X(rsi) 1439 1440 "addq $128, %%rsi\n\t" 1441 1442 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 1443 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 1444 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 1445 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 1446 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 1447 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 1448 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 1449 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 1450 1451 "subl $1, 16*8(" WX ")\n\t" 1452 "jne 1b\n\t" 1453 1454 SET_W_X(rsi) 1455 1456 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 1457 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 1458 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 1459 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 1460 1461 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 1462 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 1463 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 1464 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 1465 1466 STORE_ADD_DIGEST() 1467 1468 "addq $136, %%rsp\n\t" 1469 1470 : 1471 : [mask] "m" (mBYTE_FLIP_MASK), 1472 [sha512] "r" (sha512), 1473 [K512] "m" (K512) 1474 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 1475 ); 1476 1477 return 0; 1478 } 1479 1480 static int Transform_Sha512_AVX1_Len(wc_Sha512* sha512, word32 len) 1481 { 1482 __asm__ __volatile__ ( 1483 1484 "movq 224(%[sha512]), %%rsi\n\t" 1485 "leaq %[K512], %%rdx\n\t" 1486 1487 INIT_MASK(MASK) 1488 LOAD_DIGEST() 1489 1490 "# Start of processing a block\n" 1491 "2:\n\t" 1492 1493 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp). 1494 * Debug needs more stack space. 
*/ 1495 "subq $256, %%rsp\n\t" 1496 1497 LOAD_W(MASK, rsi) 1498 1499 "movl $4, 16*8(" WX ")\n\t" 1500 /* b */ 1501 "movq %%r9, " L4 "\n\t" 1502 /* e */ 1503 "movq %%r12, " L1 "\n\t" 1504 /* b ^ c */ 1505 "xorq %%r10, " L4 "\n\t" 1506 1507 SET_W_X(rdx) 1508 1509 "# Start of 16 rounds\n" 1510 "1:\n\t" 1511 1512 "addq $128, %%rdx\n\t" 1513 "movq %%rdx, 17*8(%%rsp)\n\t" 1514 1515 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 1516 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 1517 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 1518 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 1519 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 1520 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 1521 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 1522 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 1523 1524 "movq 17*8(%%rsp), %%rdx\n\t" 1525 1526 SET_W_X(rdx) 1527 1528 "subl $1, 16*8(" WX ")\n\t" 1529 "jne 1b\n\t" 1530 1531 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 1532 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 1533 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 1534 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 1535 1536 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 1537 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 1538 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 1539 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 1540 1541 ADD_DIGEST() 1542 1543 "addq $256, %%rsp\n\t" 1544 "leaq %[K512], %%rdx\n\t" 1545 "addq $128, %%rsi\n\t" 1546 "subl $128, %[len]\n\t" 1547 1548 STORE_DIGEST() 1549 1550 "jnz 2b\n\t" 1551 1552 : 1553 : [mask] "m" (mBYTE_FLIP_MASK), 1554 [len] "m" (len), 1555 [sha512] "r" (sha512), 1556 [K512] "m" (K512) 1557 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 1558 ); 1087 1559 1088 1560 return 0; … … 1090 1562 #endif /* HAVE_INTEL_AVX1 */ 1091 1563 1092 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_ AVX1) && defined(HAVE_INTEL_RORX)1093 static int Transform_ AVX1_RORX(wc_Sha512* sha512)1564 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) 1565 static int Transform_Sha512_AVX1_RORX(wc_Sha512* sha512) 1094 1566 { 1095 const word64* K = K512; 1096 word64 W_X[16+4] = {0}; 1097 word32 j; 1098 word64 T[8]; 1099 1100 /* Copy digest to working vars */ 1101 XMEMCPY(T, sha512->digest, sizeof(T)); 1102 1103 W_from_buff(W_X, sha512->buffer); 1104 for (j = 0; j < 80; j += 16) { 1105 Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); Block_0_2(W_X); 1106 Rx_RORX_3( 0); Block_0_3(); 1107 Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5(); 1108 Rx_RORX_3( 1); Block_0_6(W_X); 1109 Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); Block_0_8(W_X); 1110 Rx_RORX_3( 2); Block_0_9(); 1111 Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11(); 1112 Rx_RORX_3( 3); Block_0_12(W_X); 1113 1114 Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); Block_4_2(W_X); 1115 Rx_RORX_3( 4); Block_4_3(); 1116 Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5(); 1117 Rx_RORX_3( 5); Block_4_6(W_X); 1118 Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); Block_4_8(W_X); 1119 Rx_RORX_3( 6); Block_4_9(); 1120 Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11(); 1121 Rx_RORX_3( 7); Block_4_12(W_X); 1122 1123 Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); Block_8_2(W_X); 1124 Rx_RORX_3( 8); Block_8_3(); 1125 Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5(); 1126 Rx_RORX_3( 9); Block_8_6(W_X); 1127 Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); Block_8_8(W_X); 1128 
Rx_RORX_3(10); Block_8_9(); 1129 Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11(); 1130 Rx_RORX_3(11); Block_8_12(W_X); 1131 1132 Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12); Block_12_2(W_X); 1133 Rx_RORX_3(12); Block_12_3(); 1134 Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5(); 1135 Rx_RORX_3(13); Block_12_6(W_X); 1136 Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); Block_12_8(W_X); 1137 Rx_RORX_3(14); Block_12_9(); 1138 Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11(); 1139 Rx_RORX_3(15); Block_12_12(W_X); 1140 } 1141 1142 /* Add the working vars back into digest */ 1143 sha512->digest[0] += a(0); 1144 sha512->digest[1] += b(0); 1145 sha512->digest[2] += c(0); 1146 sha512->digest[3] += d(0); 1147 sha512->digest[4] += e(0); 1148 sha512->digest[5] += f(0); 1149 sha512->digest[6] += g(0); 1150 sha512->digest[7] += h(0); 1151 1152 /* Wipe variables */ 1153 #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2) 1154 XMEMSET(W_X, 0, sizeof(word64) * 16); 1155 #endif 1156 XMEMSET(T, 0, sizeof(T)); 1567 __asm__ __volatile__ ( 1568 1569 /* 16 Ws plus loop counter and K512. */ 1570 "subq $144, %%rsp\n\t" 1571 "leaq 64(%[sha512]), %%rax\n\t" 1572 1573 INIT_MASK(MASK) 1574 LOAD_DIGEST() 1575 1576 LOAD_W(MASK, rax) 1577 1578 "movl $4, 16*8(" WX ")\n\t" 1579 "leaq %[K512], %%rsi\n\t" 1580 /* L4 = b */ 1581 "movq %%r9, " L4 "\n\t" 1582 /* L3 = 0 (add to prev h) */ 1583 "xorq " L3 ", " L3 "\n\t" 1584 /* L4 = b ^ c */ 1585 "xorq %%r10, " L4 "\n\t" 1586 1587 SET_W_X(rsi) 1588 1589 "# Start of 16 rounds\n" 1590 "1:\n\t" 1591 1592 "addq $128, %%rsi\n\t" 1593 1594 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 1595 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 1596 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 1597 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 1598 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 1599 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 1600 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 1601 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 1602 1603 SET_W_X(rsi) 1604 1605 "subl $1, 16*8(" WX ")\n\t" 1606 "jne 1b\n\t" 1607 1608 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 1609 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 1610 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 1611 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 1612 1613 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 1614 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 1615 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 1616 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 1617 1618 /* Prev RND: h += Maj(a,b,c) */ 1619 "addq " L3 ", %%r8\n\t" 1620 "addq $144, %%rsp\n\t" 1621 1622 STORE_ADD_DIGEST() 1623 1624 : 1625 : [mask] "m" (mBYTE_FLIP_MASK), 1626 [sha512] "r" (sha512), 1627 [K512] "m" (K512) 1628 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 1629 ); 1630 1631 return 0; 1632 } 1633 1634 static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512* sha512, word32 len) 1635 { 1636 __asm__ __volatile__ ( 1637 1638 "movq 224(%[sha512]), %%rsi\n\t" 1639 "leaq %[K512], %%rcx\n\t" 1640 1641 INIT_MASK(MASK) 1642 LOAD_DIGEST() 1643 1644 "# Start of processing a block\n" 1645 "2:\n\t" 1646 1647 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp). 1648 * Debug needs more stack space. 
*/ 1649 "subq $256, %%rsp\n\t" 1650 1651 LOAD_W(MASK, rsi) 1652 1653 "movl $4, 16*8(" WX ")\n\t" 1654 /* L4 = b */ 1655 "movq %%r9, " L4 "\n\t" 1656 /* L3 = 0 (add to prev h) */ 1657 "xorq " L3 ", " L3 "\n\t" 1658 /* L4 = b ^ c */ 1659 "xorq %%r10, " L4 "\n\t" 1660 1661 SET_W_X(rcx) 1662 1663 "# Start of 16 rounds\n" 1664 "1:\n\t" 1665 1666 "addq $128, %%rcx\n\t" 1667 "movq %%rcx, 17*8(%%rsp)\n\t" 1668 1669 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 1670 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 1671 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 1672 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 1673 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 1674 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 1675 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 1676 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 1677 1678 "movq 17*8(%%rsp), %%rcx\n\t" 1679 1680 SET_W_X(rcx) 1681 1682 "subl $1, 16*8(" WX ")\n\t" 1683 "jne 1b\n\t" 1684 1685 SET_W_X(rcx) 1686 1687 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 1688 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 1689 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 1690 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 1691 1692 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 1693 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 1694 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 1695 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 1696 1697 /* Prev RND: h += Maj(a,b,c) */ 1698 "addq " L3 ", %%r8\n\t" 1699 "addq $256, %%rsp\n\t" 1700 1701 ADD_DIGEST() 1702 1703 "leaq %[K512], %%rcx\n\t" 1704 "addq $128, %%rsi\n\t" 1705 "subl $128, %[len]\n\t" 1706 1707 STORE_DIGEST() 1708 1709 "jnz 2b\n\t" 1710 1711 : 1712 : [mask] "m" (mBYTE_FLIP_MASK), 1713 [len] "m" (len), 1714 [sha512] "r" (sha512), 1715 [K512] "m" (K512) 1716 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 1717 ); 1157 1718 1158 1719 return 0; 1159 1720 } 1160 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_ AVX1 && HAVE_INTEL_RORX */1721 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */ 1161 1722 1162 1723 #if defined(HAVE_INTEL_AVX2) 1163 1164 #define s0_1y(dest, src) AVX2_S(dest, src, 1); 1165 #define s0_2y(dest, src) AVX2_S(G_TEMPy, src, 8); XORy(dest, G_TEMPy, dest); 1166 #define s0_3y(dest, src) AVX2_R(G_TEMPy, src, 7); XORy(dest, G_TEMPy, dest); 1167 1168 #define s1_1y(dest, src) AVX2_S(dest, src, 19); 1169 #define s1_2y(dest, src) AVX2_S(G_TEMPy, src, 61); XORy(dest, G_TEMPy, dest); 1170 #define s1_3y(dest, src) AVX2_R(G_TEMPy, src, 6); XORy(dest, G_TEMPy, dest); 1171 1172 #define s0_y(dest, src) s0_1y(dest, src); s0_2y(dest, src); s0_3y(dest, src) 1173 #define s1_y(dest, src) s1_1y(dest, src); s1_2y(dest, src); s1_3y(dest, src) 1174 1175 1176 #define Block_Y_xx_1(i, w_0, w_4, w_8, w_12)\ 1177 MOVE_W_to_W_I_15(W_I_15y, w_0, w_4);\ 1178 MOVE_W_to_W_I_7 (W_I_7y, w_8, w_12);\ 1179 MOVE_W_to_W_I_2 (W_I_2y, w_12);\ 1180 1181 #define Block_Y_xx_2(i, w_0, w_4, w_8, w_12)\ 1182 s0_1y (YMM_TEMP0, W_I_15y);\ 1183 1184 #define Block_Y_xx_3(i, w_0, w_4, w_8, w_12)\ 1185 s0_2y (YMM_TEMP0, W_I_15y);\ 1186 1187 #define Block_Y_xx_4(i, w_0, w_4, w_8, w_12)\ 1188 s0_3y (YMM_TEMP0, W_I_15y);\ 1189 1190 #define Block_Y_xx_5(i, w_0, w_4, w_8, w_12)\ 1191 ADDy(W_I_TEMPy, w_0, YMM_TEMP0);\ 1192 1193 #define Block_Y_xx_6(i, w_0, w_4, w_8, w_12)\ 1194 ADDy(W_I_TEMPy, W_I_TEMPy, W_I_7y);\ 1195 s1_1y (YMM_TEMP0, 
W_I_2y);\ 1196 1197 #define Block_Y_xx_7(i, w_0, w_4, w_8, w_12)\ 1198 s1_2y (YMM_TEMP0, W_I_2y);\ 1199 1200 #define Block_Y_xx_8(i, w_0, w_4, w_8, w_12)\ 1201 s1_3y (YMM_TEMP0, W_I_2y);\ 1202 ADDy(w_0, W_I_TEMPy, YMM_TEMP0);\ 1203 1204 #define Block_Y_xx_9(i, w_0, w_4, w_8, w_12)\ 1205 FEEDBACK1_to_W_I_2(W_I_2y, w_0);\ 1206 1207 #define Block_Y_xx_10(i, w_0, w_4, w_8, w_12) \ 1208 s1_1y (YMM_TEMP0, W_I_2y);\ 1209 1210 #define Block_Y_xx_11(i, w_0, w_4, w_8, w_12) \ 1211 s1_2y (YMM_TEMP0, W_I_2y);\ 1212 1213 #define Block_Y_xx_12(i, w_0, w_4, w_8, w_12)\ 1214 s1_3y (YMM_TEMP0, W_I_2y);\ 1215 ADDy(w_0, W_I_TEMPy, YMM_TEMP0);\ 1216 MOVE_to_MEMy(w,0, w_4);\ 1217 1218 1219 static INLINE void Block_Y_0_1(void) { Block_Y_xx_1(0, W_0y, W_4y, W_8y, W_12y); } 1220 static INLINE void Block_Y_0_2(void) { Block_Y_xx_2(0, W_0y, W_4y, W_8y, W_12y); } 1221 static INLINE void Block_Y_0_3(void) { Block_Y_xx_3(0, W_0y, W_4y, W_8y, W_12y); } 1222 static INLINE void Block_Y_0_4(void) { Block_Y_xx_4(0, W_0y, W_4y, W_8y, W_12y); } 1223 static INLINE void Block_Y_0_5(void) { Block_Y_xx_5(0, W_0y, W_4y, W_8y, W_12y); } 1224 static INLINE void Block_Y_0_6(void) { Block_Y_xx_6(0, W_0y, W_4y, W_8y, W_12y); } 1225 static INLINE void Block_Y_0_7(void) { Block_Y_xx_7(0, W_0y, W_4y, W_8y, W_12y); } 1226 static INLINE void Block_Y_0_8(void) { Block_Y_xx_8(0, W_0y, W_4y, W_8y, W_12y); } 1227 static INLINE void Block_Y_0_9(void) { Block_Y_xx_9(0, W_0y, W_4y, W_8y, W_12y); } 1228 static INLINE void Block_Y_0_10(void){ Block_Y_xx_10(0, W_0y, W_4y, W_8y, W_12y); } 1229 static INLINE void Block_Y_0_11(void){ Block_Y_xx_11(0, W_0y, W_4y, W_8y, W_12y); } 1230 static INLINE void Block_Y_0_12(word64 *w){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y); } 1231 1232 static INLINE void Block_Y_4_1(void) { Block_Y_xx_1(4, W_4y, W_8y, W_12y, W_0y); } 1233 static INLINE void Block_Y_4_2(void) { Block_Y_xx_2(4, W_4y, W_8y, W_12y, W_0y); } 1234 static INLINE void Block_Y_4_3(void) { Block_Y_xx_3(4, W_4y, W_8y, W_12y, W_0y); } 1235 static INLINE void Block_Y_4_4(void) { Block_Y_xx_4(4, W_4y, W_8y, W_12y, W_0y); } 1236 static INLINE void Block_Y_4_5(void) { Block_Y_xx_5(4, W_4y, W_8y, W_12y, W_0y); } 1237 static INLINE void Block_Y_4_6(void) { Block_Y_xx_6(4, W_4y, W_8y, W_12y, W_0y); } 1238 static INLINE void Block_Y_4_7(void) { Block_Y_xx_7(4, W_4y, W_8y, W_12y, W_0y); } 1239 static INLINE void Block_Y_4_8(void) { Block_Y_xx_8(4, W_4y, W_8y, W_12y, W_0y); } 1240 static INLINE void Block_Y_4_9(void) { Block_Y_xx_9(4, W_4y, W_8y, W_12y, W_0y); } 1241 static INLINE void Block_Y_4_10(void) { Block_Y_xx_10(4, W_4y, W_8y, W_12y, W_0y); } 1242 static INLINE void Block_Y_4_11(void) { Block_Y_xx_11(4, W_4y, W_8y, W_12y, W_0y); } 1243 static INLINE void Block_Y_4_12(word64 *w) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y); } 1244 1245 static INLINE void Block_Y_8_1(void) { Block_Y_xx_1(8, W_8y, W_12y, W_0y, W_4y); } 1246 static INLINE void Block_Y_8_2(void) { Block_Y_xx_2(8, W_8y, W_12y, W_0y, W_4y); } 1247 static INLINE void Block_Y_8_3(void) { Block_Y_xx_3(8, W_8y, W_12y, W_0y, W_4y); } 1248 static INLINE void Block_Y_8_4(void) { Block_Y_xx_4(8, W_8y, W_12y, W_0y, W_4y); } 1249 static INLINE void Block_Y_8_5(void) { Block_Y_xx_5(8, W_8y, W_12y, W_0y, W_4y); } 1250 static INLINE void Block_Y_8_6(void) { Block_Y_xx_6(8, W_8y, W_12y, W_0y, W_4y); } 1251 static INLINE void Block_Y_8_7(void) { Block_Y_xx_7(8, W_8y, W_12y, W_0y, W_4y); } 1252 static INLINE void Block_Y_8_8(void) { Block_Y_xx_8(8, W_8y, W_12y, W_0y, W_4y); } 1253 static INLINE void 
Block_Y_8_9(void) { Block_Y_xx_9(8, W_8y, W_12y, W_0y, W_4y); } 1254 static INLINE void Block_Y_8_10(void) { Block_Y_xx_10(8, W_8y, W_12y, W_0y, W_4y); } 1255 static INLINE void Block_Y_8_11(void) { Block_Y_xx_11(8, W_8y, W_12y, W_0y, W_4y); } 1256 static INLINE void Block_Y_8_12(word64 *w) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y); } 1257 1258 static INLINE void Block_Y_12_1(void) { Block_Y_xx_1(12, W_12y, W_0y, W_4y, W_8y); } 1259 static INLINE void Block_Y_12_2(void) { Block_Y_xx_2(12, W_12y, W_0y, W_4y, W_8y); } 1260 static INLINE void Block_Y_12_3(void) { Block_Y_xx_3(12, W_12y, W_0y, W_4y, W_8y); } 1261 static INLINE void Block_Y_12_4(void) { Block_Y_xx_4(12, W_12y, W_0y, W_4y, W_8y); } 1262 static INLINE void Block_Y_12_5(void) { Block_Y_xx_5(12, W_12y, W_0y, W_4y, W_8y); } 1263 static INLINE void Block_Y_12_6(void) { Block_Y_xx_6(12, W_12y, W_0y, W_4y, W_8y); } 1264 static INLINE void Block_Y_12_7(void) { Block_Y_xx_7(12, W_12y, W_0y, W_4y, W_8y); } 1265 static INLINE void Block_Y_12_8(void) { Block_Y_xx_8(12, W_12y, W_0y, W_4y, W_8y); } 1266 static INLINE void Block_Y_12_9(void) { Block_Y_xx_9(12, W_12y, W_0y, W_4y, W_8y); } 1267 static INLINE void Block_Y_12_10(void) { Block_Y_xx_10(12, W_12y, W_0y, W_4y, W_8y); } 1268 static INLINE void Block_Y_12_11(void) { Block_Y_xx_11(12, W_12y, W_0y, W_4y, W_8y); } 1269 static INLINE void Block_Y_12_12(word64 *w) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y); } 1270 1271 1272 static int Transform_AVX2(wc_Sha512* sha512) 1724 static const unsigned long mBYTE_FLIP_MASK_Y[] = 1725 { 0x0001020304050607, 0x08090a0b0c0d0e0f, 1726 0x0001020304050607, 0x08090a0b0c0d0e0f }; 1727 1728 #define W_Y_0 ymm0 1729 #define W_Y_4 ymm1 1730 #define W_Y_8 ymm2 1731 #define W_Y_12 ymm3 1732 1733 #define X0 xmm0 1734 #define X1 xmm1 1735 #define X2 xmm2 1736 #define X3 xmm3 1737 #define X4 xmm4 1738 #define X5 xmm5 1739 #define X6 xmm6 1740 #define X7 xmm7 1741 #define X8 xmm8 1742 #define X9 xmm9 1743 #define Y0 ymm0 1744 #define Y1 ymm1 1745 #define Y2 ymm2 1746 #define Y3 ymm3 1747 #define Y4 ymm4 1748 #define Y5 ymm5 1749 #define Y6 ymm6 1750 #define Y7 ymm7 1751 1752 #define W_Y_M15 ymm12 1753 #define W_Y_M7 ymm13 1754 #define W_Y_M2 ymm14 1755 #define MASK_Y ymm15 1756 1757 #define YTMP1 ymm8 1758 #define YTMP2 ymm9 1759 #define YTMP3 ymm10 1760 #define YTMP4 ymm11 1761 1762 #define YMM_REGS \ 1763 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", \ 1764 "xmm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15" 1765 1766 #define _VPERM2I128(dest, src1, src2, sel) \ 1767 "vperm2I128 $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t" 1768 #define VPERM2I128(dest, src1, src2, sel) \ 1769 _VPERM2I128(dest, src1, src2, sel) 1770 1771 #define _VPERMQ(dest, src, sel) \ 1772 "vpermq $" #sel ", %%" #src ", %%" #dest "\n\t" 1773 #define VPERMQ(dest, src, sel) \ 1774 _VPERMQ(dest, src, sel) 1775 1776 #define _VPBLENDD(dest, src1, src2, sel) \ 1777 "vpblendd $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t" 1778 #define VPBLENDD(dest, src1, src2, sel) \ 1779 _VPBLENDD(dest, src1, src2, sel) 1780 1781 #define _V_ADD_I(dest, src1, addr, i) \ 1782 "vpaddq "#i"*8(%%" #addr "), %%" #src1 ", %%" #dest "\n\t" 1783 #define V_ADD_I(dest, src1, addr, i) \ 1784 _V_ADD_I(dest, src1, addr, i) 1785 1786 #define _VMOVDQU_I(addr, i, src) \ 1787 "vmovdqu %%" #src ", " #i "*8(%%" #addr ")\n\t" 1788 #define VMOVDQU_I(addr, i, src) \ 1789 _VMOVDQU_I(addr, i, src) 1790 1791 #define MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \ 1792 
RND_0_1(a,b,c,d,e,f,g,h,i) \ 1793 /* W[-13]..W[-15], W[-12] */ \ 1794 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \ 1795 /* W[-5]..W[-7], W[-4] */ \ 1796 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \ 1797 RND_0_2(a,b,c,d,e,f,g,h,i) \ 1798 RND_0_3(a,b,c,d,e,f,g,h,i) \ 1799 /* W_Y_M15 = W[-12]..W[-15] */ \ 1800 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \ 1801 RND_0_4(a,b,c,d,e,f,g,h,i) \ 1802 /* W_Y_M7 = W[-4]..W[-7] */ \ 1803 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \ 1804 RND_0_5(a,b,c,d,e,f,g,h,i) \ 1805 RND_0_6(a,b,c,d,e,f,g,h,i) \ 1806 /* W[-15] >> 1 */ \ 1807 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 1808 RND_0_7(a,b,c,d,e,f,g,h,i) \ 1809 /* W[-15] << 63 */ \ 1810 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 1811 RND_0_8(a,b,c,d,e,f,g,h,i) \ 1812 /* W[-15] >> 8 */ \ 1813 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 1814 RND_0_9(a,b,c,d,e,f,g,h,i) \ 1815 /* W[-15] << 56 */ \ 1816 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 1817 RND_0_10(a,b,c,d,e,f,g,h,i) \ 1818 /* W[-15] >>> 1 */ \ 1819 V_OR(YTMP1, YTMP2, YTMP1) \ 1820 RND_0_11(a,b,c,d,e,f,g,h,i) \ 1821 /* W[-15] >>> 8 */ \ 1822 V_OR(YTMP3, YTMP4, YTMP3) \ 1823 RND_0_12(a,b,c,d,e,f,g,h,i) \ 1824 RND_1_1(h,a,b,c,d,e,f,g,i+1) \ 1825 /* W[-15] >> 7 */ \ 1826 V_SHIFT_R(YTMP4, W_Y_M15, 7) \ 1827 RND_1_2_A(h,a,b,c,d,e,f,g,i+1) \ 1828 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \ 1829 V_XOR(YTMP1, YTMP3, YTMP1) \ 1830 RND_1_2_B(h,a,b,c,d,e,f,g,i+1) \ 1831 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \ 1832 V_XOR(YTMP1, YTMP4, YTMP1) \ 1833 RND_1_3(h,a,b,c,d,e,f,g,i+1) \ 1834 /* W[0] = W[-16] + W[-7] */ \ 1835 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \ 1836 RND_1_4(h,a,b,c,d,e,f,g,i+1) \ 1837 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \ 1838 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 1839 RND_1_5(h,a,b,c,d,e,f,g,i+1) \ 1840 /* 0, 0, W[-1], W[-2] */ \ 1841 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \ 1842 RND_1_6(h,a,b,c,d,e,f,g,i+1) \ 1843 RND_1_7(h,a,b,c,d,e,f,g,i+1) \ 1844 RND_1_8(h,a,b,c,d,e,f,g,i+1) \ 1845 /* W[-2] >> 19 */ \ 1846 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 1847 RND_1_9(h,a,b,c,d,e,f,g,i+1) \ 1848 /* W[-2] << 45 */ \ 1849 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 1850 RND_1_10(h,a,b,c,d,e,f,g,i+1) \ 1851 /* W[-2] >> 61 */ \ 1852 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 1853 RND_1_11(h,a,b,c,d,e,f,g,i+1) \ 1854 /* W[-2] << 3 */ \ 1855 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 1856 RND_1_12(h,a,b,c,d,e,f,g,i+1) \ 1857 RND_0_1(g,h,a,b,c,d,e,f,i+2) \ 1858 /* W[-2] >>> 19 */ \ 1859 V_OR(YTMP1, YTMP2, YTMP1) \ 1860 RND_0_2(g,h,a,b,c,d,e,f,i+2) \ 1861 /* W[-2] >>> 61 */ \ 1862 V_OR(YTMP3, YTMP4, YTMP3) \ 1863 RND_0_3(g,h,a,b,c,d,e,f,i+2) \ 1864 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 1865 V_XOR(YTMP1, YTMP3, YTMP1) \ 1866 RND_0_4(g,h,a,b,c,d,e,f,i+2) \ 1867 /* W[-2] >> 6 */ \ 1868 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 1869 RND_0_5(g,h,a,b,c,d,e,f,i+2) \ 1870 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 1871 V_XOR(YTMP1, YTMP4, YTMP1) \ 1872 RND_0_6(g,h,a,b,c,d,e,f,i+2) \ 1873 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 1874 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 1875 RND_0_7(g,h,a,b,c,d,e,f,i+2) \ 1876 RND_0_8(g,h,a,b,c,d,e,f,i+2) \ 1877 /* W[1], W[0], 0, 0 */ \ 1878 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \ 1879 RND_0_9(g,h,a,b,c,d,e,f,i+2) \ 1880 RND_0_10(g,h,a,b,c,d,e,f,i+2) \ 1881 /* W[-2] >> 19 */ \ 1882 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 1883 RND_0_11(g,h,a,b,c,d,e,f,i+2) \ 1884 /* W[-2] << 45 */ \ 1885 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 1886 RND_0_12(g,h,a,b,c,d,e,f,i+2) \ 1887 RND_1_1(f,g,h,a,b,c,d,e,i+3) \ 1888 /* W[-2] >> 61 */ \ 1889 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 1890 RND_1_2(f,g,h,a,b,c,d,e,i+3) \ 1891 /* W[-2] << 3 */ \ 1892 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 1893 
RND_1_3(f,g,h,a,b,c,d,e,i+3) \ 1894 /* W[-2] >>> 19 */ \ 1895 V_OR(YTMP1, YTMP2, YTMP1) \ 1896 RND_1_4(f,g,h,a,b,c,d,e,i+3) \ 1897 /* W[-2] >>> 61 */ \ 1898 V_OR(YTMP3, YTMP4, YTMP3) \ 1899 RND_1_5(f,g,h,a,b,c,d,e,i+3) \ 1900 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 1901 V_XOR(YTMP1, YTMP3, YTMP1) \ 1902 RND_1_6(f,g,h,a,b,c,d,e,i+3) \ 1903 /* W[-2] >> 6 */ \ 1904 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 1905 RND_1_7(f,g,h,a,b,c,d,e,i+3) \ 1906 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 1907 V_XOR(YTMP1, YTMP4, YTMP1) \ 1908 RND_1_8(f,g,h,a,b,c,d,e,i+3) \ 1909 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 1910 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 1911 RND_1_9(f,g,h,a,b,c,d,e,i+3) \ 1912 RND_1_10(f,g,h,a,b,c,d,e,i+3) \ 1913 RND_1_11(f,g,h,a,b,c,d,e,i+3) \ 1914 RND_1_12(f,g,h,a,b,c,d,e,i+3) \ 1915 1916 #define MsgSched2_AVX2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ 1917 RND_0_1(a,b,c,d,e,f,g,h,i) \ 1918 VPALIGNR(W_Y_M15, W_2, W_0, 8) \ 1919 VPALIGNR(W_Y_M7, W_10, W_8, 8) \ 1920 RND_0_2(a,b,c,d,e,f,g,h,i) \ 1921 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 1922 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 1923 RND_0_3(a,b,c,d,e,f,g,h,i) \ 1924 RND_0_4(a,b,c,d,e,f,g,h,i) \ 1925 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 1926 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 1927 RND_0_5(a,b,c,d,e,f,g,h,i) \ 1928 RND_0_6(a,b,c,d,e,f,g,h,i) \ 1929 V_OR(YTMP1, YTMP2, YTMP1) \ 1930 V_OR(YTMP3, YTMP4, YTMP3) \ 1931 RND_0_7(a,b,c,d,e,f,g,h,i) \ 1932 RND_0_8(a,b,c,d,e,f,g,h,i) \ 1933 V_SHIFT_R(YTMP4, W_Y_M15, 7) \ 1934 V_XOR(YTMP1, YTMP3, YTMP1) \ 1935 RND_0_9(a,b,c,d,e,f,g,h,i) \ 1936 RND_0_10(a,b,c,d,e,f,g,h,i) \ 1937 V_XOR(YTMP1, YTMP4, YTMP1) \ 1938 V_ADD(W_0, W_0, W_Y_M7) \ 1939 RND_0_11(a,b,c,d,e,f,g,h,i) \ 1940 RND_0_12(a,b,c,d,e,f,g,h,i) \ 1941 RND_1_1(h,a,b,c,d,e,f,g,i+1) \ 1942 V_ADD(W_0, W_0, YTMP1) \ 1943 RND_1_2(h,a,b,c,d,e,f,g,i+1) \ 1944 V_SHIFT_R(YTMP1, W_14, 19) \ 1945 V_SHIFT_L(YTMP2, W_14, 45) \ 1946 RND_1_3(h,a,b,c,d,e,f,g,i+1) \ 1947 RND_1_4(h,a,b,c,d,e,f,g,i+1) \ 1948 V_SHIFT_R(YTMP3, W_14, 61) \ 1949 V_SHIFT_L(YTMP4, W_14, 3) \ 1950 RND_1_5(h,a,b,c,d,e,f,g,i+1) \ 1951 RND_1_6(h,a,b,c,d,e,f,g,i+1) \ 1952 RND_1_7(h,a,b,c,d,e,f,g,i+1) \ 1953 V_OR(YTMP1, YTMP2, YTMP1) \ 1954 V_OR(YTMP3, YTMP4, YTMP3) \ 1955 RND_1_8(h,a,b,c,d,e,f,g,i+1) \ 1956 RND_1_9(h,a,b,c,d,e,f,g,i+1) \ 1957 V_XOR(YTMP1, YTMP3, YTMP1) \ 1958 V_SHIFT_R(YTMP4, W_14, 6) \ 1959 RND_1_10(h,a,b,c,d,e,f,g,i+1) \ 1960 RND_1_11(h,a,b,c,d,e,f,g,i+1) \ 1961 V_XOR(YTMP1, YTMP4, YTMP1) \ 1962 RND_1_12(h,a,b,c,d,e,f,g,i+1) \ 1963 V_ADD(W_0, W_0, YTMP1) \ 1964 1965 #define MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \ 1966 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ 1967 /* W[-13]..W[-15], W[-12] */ \ 1968 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \ 1969 /* W[-5]..W[-7], W[-4] */ \ 1970 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \ 1971 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ 1972 /* W_Y_M15 = W[-12]..W[-15] */ \ 1973 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \ 1974 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ 1975 /* W_Y_M7 = W[-4]..W[-7] */ \ 1976 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \ 1977 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ 1978 /* W[-15] >> 1 */ \ 1979 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 1980 /* W[-15] << 63 */ \ 1981 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 1982 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \ 1983 /* W[-15] >> 8 */ \ 1984 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 1985 /* W[-15] << 56 */ \ 1986 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 1987 /* W[-15] >>> 1 */ \ 1988 V_OR(YTMP1, YTMP2, YTMP1) \ 1989 /* W[-15] >>> 8 */ \ 1990 V_OR(YTMP3, YTMP4, YTMP3) \ 1991 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \ 1992 /* W[-15] >> 7 */ \ 1993 
V_SHIFT_R(YTMP4, W_Y_M15, 7) \ 1994 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \ 1995 /* 0, 0, W[-1], W[-2] */ \ 1996 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \ 1997 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \ 1998 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \ 1999 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \ 2000 V_XOR(YTMP1, YTMP3, YTMP1) \ 2001 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \ 2002 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \ 2003 V_XOR(YTMP1, YTMP4, YTMP1) \ 2004 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \ 2005 /* W[0] = W[-16] + W[-7] */ \ 2006 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \ 2007 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \ 2008 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 2009 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \ 2010 /* W[-2] >> 19 */ \ 2011 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 2012 /* W[-2] << 45 */ \ 2013 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 2014 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \ 2015 /* W[-2] >> 61 */ \ 2016 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 2017 /* W[-2] << 3 */ \ 2018 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 2019 /* W[-2] >>> 19 */ \ 2020 V_OR(YTMP1, YTMP2, YTMP1) \ 2021 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \ 2022 /* W[-2] >>> 61 */ \ 2023 V_OR(YTMP3, YTMP4, YTMP3) \ 2024 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \ 2025 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 2026 V_XOR(YTMP1, YTMP3, YTMP1) \ 2027 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \ 2028 /* W[-2] >> 6 */ \ 2029 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 2030 RND_RORX_0_1(g,h,a,b,c,d,e,f,i+2) \ 2031 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 2032 V_XOR(YTMP1, YTMP4, YTMP1) \ 2033 RND_RORX_0_2(g,h,a,b,c,d,e,f,i+2) \ 2034 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 2035 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 2036 RND_RORX_0_3(g,h,a,b,c,d,e,f,i+2) \ 2037 /* W[1], W[0], 0, 0 */ \ 2038 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \ 2039 RND_RORX_0_4(g,h,a,b,c,d,e,f,i+2) \ 2040 RND_RORX_0_5(g,h,a,b,c,d,e,f,i+2) \ 2041 /* W[-2] >> 19 */ \ 2042 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 2043 /* W[-2] << 45 */ \ 2044 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 2045 RND_RORX_0_6(g,h,a,b,c,d,e,f,i+2) \ 2046 /* W[-2] >> 61 */ \ 2047 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 2048 /* W[-2] << 3 */ \ 2049 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 2050 /* W[-2] >>> 19 */ \ 2051 V_OR(YTMP1, YTMP2, YTMP1) \ 2052 RND_RORX_0_7(g,h,a,b,c,d,e,f,i+2) \ 2053 /* W[-2] >>> 61 */ \ 2054 V_OR(YTMP3, YTMP4, YTMP3) \ 2055 RND_RORX_0_8(g,h,a,b,c,d,e,f,i+2) \ 2056 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 2057 V_XOR(YTMP1, YTMP3, YTMP1) \ 2058 RND_RORX_1_1(f,g,h,a,b,c,d,e,i+3) \ 2059 /* W[-2] >> 6 */ \ 2060 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 2061 RND_RORX_1_2(f,g,h,a,b,c,d,e,i+3) \ 2062 RND_RORX_1_3(f,g,h,a,b,c,d,e,i+3) \ 2063 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 2064 V_XOR(YTMP1, YTMP4, YTMP1) \ 2065 RND_RORX_1_4(f,g,h,a,b,c,d,e,i+3) \ 2066 RND_RORX_1_5(f,g,h,a,b,c,d,e,i+3) \ 2067 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 2068 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 2069 RND_RORX_1_6(f,g,h,a,b,c,d,e,i+3) \ 2070 V_ADD_I(YTMP1, W_Y_0, rsi, i) \ 2071 RND_RORX_1_7(f,g,h,a,b,c,d,e,i+3) \ 2072 RND_RORX_1_8(f,g,h,a,b,c,d,e,i+3) \ 2073 VMOVDQU_I(rsp, i, YTMP1) \ 2074 2075 #define MsgSched2_AVX2_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e, \ 2076 f,g,h,i) \ 2077 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ 2078 VPALIGNR(W_Y_M15, W_2, W_0, 8) \ 2079 VPALIGNR(W_Y_M7, W_10, W_8, 8) \ 2080 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ 2081 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 2082 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 2083 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ 2084 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 2085 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 2086 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ 2087 V_OR(YTMP1, YTMP2, YTMP1) \ 2088 V_OR(YTMP3, YTMP4, 
YTMP3) \ 2089 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \ 2090 V_SHIFT_R(YTMP4, W_Y_M15, 7) \ 2091 V_XOR(YTMP1, YTMP3, YTMP1) \ 2092 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \ 2093 V_XOR(YTMP1, YTMP4, YTMP1) \ 2094 V_ADD(W_0, W_0, W_Y_M7) \ 2095 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \ 2096 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \ 2097 V_ADD(W_0, W_0, YTMP1) \ 2098 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \ 2099 V_SHIFT_R(YTMP1, W_14, 19) \ 2100 V_SHIFT_L(YTMP2, W_14, 45) \ 2101 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \ 2102 V_SHIFT_R(YTMP3, W_14, 61) \ 2103 V_SHIFT_L(YTMP4, W_14, 3) \ 2104 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \ 2105 V_OR(YTMP1, YTMP2, YTMP1) \ 2106 V_OR(YTMP3, YTMP4, YTMP3) \ 2107 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \ 2108 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \ 2109 V_XOR(YTMP1, YTMP3, YTMP1) \ 2110 V_SHIFT_R(YTMP4, W_14, 6) \ 2111 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \ 2112 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \ 2113 V_XOR(YTMP1, YTMP4, YTMP1) \ 2114 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \ 2115 V_ADD(W_0, W_0, YTMP1) \ 2116 2117 2118 #define _INIT_MASK_Y(mask) \ 2119 "vmovdqu %[mask], %%"#mask"\n\t" 2120 #define INIT_MASK_Y(mask) \ 2121 _INIT_MASK_Y(mask) 2122 2123 /* Load into YMM registers and swap endian. */ 2124 #define _LOAD_BLOCK_W_Y_2(mask, ymm0, ymm1, reg, i) \ 2125 /* buffer[0..15] => ymm0..ymm3; */ \ 2126 "vmovdqu " #i "+ 0(%%" #reg "), %%" #ymm0 "\n\t" \ 2127 "vmovdqu " #i "+32(%%" #reg "), %%" #ymm1 "\n\t" \ 2128 "vpshufb %%" #mask ", %%" #ymm0 ", %%" #ymm0 "\n\t" \ 2129 "vpshufb %%" #mask ", %%" #ymm1 ", %%" #ymm1 "\n\t" 2130 2131 #define LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) \ 2132 _LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) 2133 2134 #define LOAD_BLOCK_W_Y(mask, reg) \ 2135 LOAD_BLOCK_W_Y_2(mask, W_Y_0, W_Y_4 , reg, 0) \ 2136 LOAD_BLOCK_W_Y_2(mask, W_Y_8, W_Y_12, reg, 64) 2137 2138 #define _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \ 2139 "vpaddq " #i "+ 0(%%" #reg "), %%" #ymm0 ", %%" #ymm2 "\n\t" \ 2140 "vpaddq " #i "+32(%%" #reg "), %%" #ymm1 ", %%" #ymm3 "\n\t" \ 2141 "vmovdqu %%" #ymm2 ", " #i "+ 0(" WX ")\n\t" \ 2142 "vmovdqu %%" #ymm3 ", " #i "+32(" WX ")\n\t" 2143 2144 #define SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \ 2145 _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) 2146 2147 #define SET_BLOCK_W_Y(reg) \ 2148 SET_W_Y_2(W_Y_0, W_Y_4 , YTMP1, YTMP2, reg, 0) \ 2149 SET_W_Y_2(W_Y_8, W_Y_12, YTMP1, YTMP2, reg, 64) 2150 2151 /* Load into YMM registers and swap endian. 
*/ 2152 #define _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \ 2153 "vmovdqu " #i "+ 0(%%" #reg "), %%" #X0 "\n\t" \ 2154 "vmovdqu " #i "+ 16(%%" #reg "), %%" #X1 "\n\t" \ 2155 "vmovdqu " #i "+128(%%" #reg "), %%" #X8 "\n\t" \ 2156 "vmovdqu " #i "+144(%%" #reg "), %%" #X9 "\n\t" \ 2157 "vinserti128 $1, %%" #X8 ", %%" #Y0 ", %%" #Y0 "\n\t" \ 2158 "vinserti128 $1, %%" #X9 ", %%" #Y1 ", %%" #Y1 "\n\t" \ 2159 "vpshufb %%" #mask ", %%" #Y0 ", %%" #Y0 "\n\t" \ 2160 "vpshufb %%" #mask ", %%" #Y1 ", %%" #Y1 "\n\t" 2161 2162 #define LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \ 2163 _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) 2164 2165 #define LOAD_BLOCK2_W_Y(mask, reg) \ 2166 LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, 0) \ 2167 LOAD_BLOCK2_W_Y_2(mask, Y2, Y3, X2, X3, X8, X9, reg, 32) \ 2168 LOAD_BLOCK2_W_Y_2(mask, Y4, Y5, X4, X5, X8, X9, reg, 64) \ 2169 LOAD_BLOCK2_W_Y_2(mask, Y6, Y7, X6, X7, X8, X9, reg, 96) \ 2170 2171 #define SET_BLOCK2_W_Y(reg) \ 2172 SET_W_Y_2(Y0, Y1, YTMP1, YTMP2, reg, 0) \ 2173 SET_W_Y_2(Y2, Y3, YTMP1, YTMP2, reg, 64) \ 2174 SET_W_Y_2(Y4, Y5, YTMP1, YTMP2, reg, 128) \ 2175 SET_W_Y_2(Y6, Y7, YTMP1, YTMP2, reg, 192) 2176 2177 static const word64 K512_AVX2[160] = { 2178 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), 2179 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), 2180 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), 2181 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), 2182 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), 2183 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), 2184 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118), 2185 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118), 2186 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe), 2187 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe), 2188 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2), 2189 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2), 2190 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1), 2191 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1), 2192 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694), 2193 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694), 2194 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3), 2195 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3), 2196 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65), 2197 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65), 2198 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483), 2199 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483), 2200 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5), 2201 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5), 2202 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210), 2203 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210), 2204 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4), 2205 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4), 2206 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725), 2207 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725), 2208 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70), 2209 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70), 2210 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926), 2211 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926), 2212 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df), 2213 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df), 2214 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8), 2215 W64LIT(0x650a73548baf63de), 
W64LIT(0x766a0abb3c77b2a8), 2216 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b), 2217 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b), 2218 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001), 2219 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001), 2220 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30), 2221 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30), 2222 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910), 2223 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910), 2224 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8), 2225 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8), 2226 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53), 2227 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53), 2228 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), 2229 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), 2230 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), 2231 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), 2232 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), 2233 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), 2234 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), 2235 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), 2236 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), 2237 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), 2238 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), 2239 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), 2240 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), 2241 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), 2242 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), 2243 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), 2244 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), 2245 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), 2246 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), 2247 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), 2248 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), 2249 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), 2250 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), 2251 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), 2252 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), 2253 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), 2254 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), 2255 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), 2256 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817), 2257 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) 2258 }; 2259 static const word64* K512_AVX2_END = &K512_AVX2[128]; 2260 2261 static int Transform_Sha512_AVX2(wc_Sha512* sha512) 1273 2262 { 1274 const word64* K = K512; 1275 word64 w[4]; 1276 word32 j; 1277 word64 T[8]; 1278 1279 /* Copy digest to working vars */ 1280 XMEMCPY(T, sha512->digest, sizeof(T)); 1281 1282 W_from_buff_Y(sha512->buffer); 1283 MOVE_to_MEMy(w,0, W_0y); 1284 for (j = 0; j < 80; j += 16) { 1285 Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]); Block_Y_0_2(); 1286 Ry_3( 0, w[0]); Block_Y_0_3(); 1287 Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]); Block_Y_0_5(); 1288 Ry_3( 1, w[1]); Block_Y_0_6(); 1289 Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); Block_Y_0_8(); 1290 Ry_3( 2, w[2]); Block_Y_0_9(); 1291 Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); Block_Y_0_11(); 1292 Ry_3( 3, w[3]); Block_Y_0_12(w); 1293 1294 Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); Block_Y_4_2(); 1295 Ry_3( 4, w[0]); Block_Y_4_3(); 1296 Ry_1( 5, w[1]); 
Block_Y_4_4(); Ry_2( 5, w[1]); Block_Y_4_5(); 1297 Ry_3( 5, w[1]); Block_Y_4_6(); 1298 Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); Block_Y_4_8(); 1299 Ry_3( 6, w[2]); Block_Y_4_9(); 1300 Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);Block_Y_4_11(); 1301 Ry_3( 7, w[3]);Block_Y_4_12(w); 1302 1303 Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); Block_Y_8_2(); 1304 Ry_3( 8, w[0]); Block_Y_8_3(); 1305 Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]); Block_Y_8_5(); 1306 Ry_3( 9, w[1]); Block_Y_8_6(); 1307 Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); Block_Y_8_8(); 1308 Ry_3(10, w[2]); Block_Y_8_9(); 1309 Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); Block_Y_8_11(); 1310 Ry_3(11, w[3]); Block_Y_8_12(w); 1311 1312 Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); Block_Y_12_2(); 1313 Ry_3(12, w[0]); Block_Y_12_3(); 1314 Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]); Block_Y_12_5(); 1315 Ry_3(13, w[1]); Block_Y_12_6(); 1316 Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); Block_Y_12_8(); 1317 Ry_3(14, w[2]); Block_Y_12_9(); 1318 Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); Block_Y_12_11(); 1319 Ry_3(15, w[3]);Block_Y_12_12(w); 1320 } 1321 1322 /* Add the working vars back into digest */ 1323 sha512->digest[0] += a(0); 1324 sha512->digest[1] += b(0); 1325 sha512->digest[2] += c(0); 1326 sha512->digest[3] += d(0); 1327 sha512->digest[4] += e(0); 1328 sha512->digest[5] += f(0); 1329 sha512->digest[6] += g(0); 1330 sha512->digest[7] += h(0); 1331 1332 /* Wipe variables */ 1333 #if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) 1334 XMEMSET(W, 0, sizeof(word64) * 16); 1335 #endif 1336 XMEMSET(T, 0, sizeof(T)); 2263 __asm__ __volatile__ ( 2264 2265 /* 16 Ws plus loop counter and K512. */ 2266 "subq $136, %%rsp\n\t" 2267 "leaq 64(%[sha512]), %%rax\n\t" 2268 2269 INIT_MASK(MASK_Y) 2270 LOAD_DIGEST() 2271 2272 LOAD_BLOCK_W_Y(MASK_Y, rax) 2273 2274 "movl $4, 16*8(" WX ")\n\t" 2275 "leaq %[K512], %%rsi\n\t" 2276 /* b */ 2277 "movq %%r9, " L4 "\n\t" 2278 /* e */ 2279 "movq %%r12, " L1 "\n\t" 2280 /* b ^ c */ 2281 "xorq %%r10, " L4 "\n\t" 2282 2283 SET_BLOCK_W_Y(rsi) 2284 2285 "# Start of 16 rounds\n" 2286 "1:\n\t" 2287 2288 "addq $128, %%rsi\n\t" 2289 2290 MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0) 2291 MsgSched4_AVX2(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4) 2292 MsgSched4_AVX2(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8) 2293 MsgSched4_AVX2(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12) 2294 2295 SET_BLOCK_W_Y(rsi) 2296 2297 "subl $1, 16*8(" WX ")\n\t" 2298 "jne 1b\n\t" 2299 2300 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 2301 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 2302 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 2303 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 2304 2305 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 2306 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 2307 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 2308 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 2309 2310 STORE_ADD_DIGEST() 2311 2312 "addq $136, %%rsp\n\t" 2313 2314 : 2315 : [mask] "m" (mBYTE_FLIP_MASK_Y), 2316 [sha512] "r" (sha512), 2317 [K512] "m" (K512) 2318 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" 2319 ); 2320 2321 return 0; 2322 } 2323 2324 static int Transform_Sha512_AVX2_Len(wc_Sha512* sha512, word32 len) 2325 { 2326 if ((len & WC_SHA512_BLOCK_SIZE) != 0) { 2327 XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE); 2328 Transform_Sha512_AVX2(sha512); 2329 sha512->data += WC_SHA512_BLOCK_SIZE; 2330 len -= WC_SHA512_BLOCK_SIZE; 2331 if (len == 0) 2332 return 0; 2333 } 2334 2335 __asm__ __volatile__ ( 
2336 2337 "movq 224(%[sha512]), %%rcx\n\t" 2338 2339 INIT_MASK(MASK_Y) 2340 LOAD_DIGEST() 2341 2342 "# Start of processing two blocks\n" 2343 "2:\n\t" 2344 2345 "subq $1344, %%rsp\n\t" 2346 "leaq %[K512], %%rsi\n\t" 2347 2348 /* L4 = b */ 2349 "movq %%r9, " L4 "\n\t" 2350 /* e */ 2351 "movq %%r12, " L1 "\n\t" 2352 2353 LOAD_BLOCK2_W_Y(MASK_Y, rcx) 2354 2355 /* L4 = b ^ c */ 2356 "xorq %%r10, " L4 "\n\t" 2357 "\n" 2358 "1:\n\t" 2359 SET_BLOCK2_W_Y(rsi) 2360 MsgSched2_AVX2(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0) 2361 MsgSched2_AVX2(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4) 2362 MsgSched2_AVX2(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8) 2363 MsgSched2_AVX2(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12) 2364 MsgSched2_AVX2(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16) 2365 MsgSched2_AVX2(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20) 2366 MsgSched2_AVX2(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24) 2367 MsgSched2_AVX2(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28) 2368 "addq $256, %%rsi\n\t" 2369 "addq $256, %%rsp\n\t" 2370 "cmpq %[K512_END], %%rsi\n\t" 2371 "jne 1b\n\t" 2372 2373 SET_BLOCK2_W_Y(rsi) 2374 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 2375 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4) 2376 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8) 2377 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12) 2378 2379 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16) 2380 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20) 2381 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24) 2382 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28) 2383 "subq $1024, %%rsp\n\t" 2384 2385 ADD_DIGEST() 2386 STORE_DIGEST() 2387 2388 /* L4 = b */ 2389 "movq %%r9, " L4 "\n\t" 2390 /* e */ 2391 "movq %%r12, " L1 "\n\t" 2392 /* L4 = b ^ c */ 2393 "xorq %%r10, " L4 "\n\t" 2394 2395 "movq $5, %%rsi\n\t" 2396 "\n" 2397 "3:\n\t" 2398 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2) 2399 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6) 2400 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10) 2401 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 2402 2403 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18) 2404 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22) 2405 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26) 2406 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30) 2407 "addq $256, %%rsp\n\t" 2408 "subq $1, %%rsi\n\t" 2409 "jnz 3b\n\t" 2410 2411 ADD_DIGEST() 2412 2413 "movq 224(%[sha512]), %%rcx\n\t" 2414 "addq $64, %%rsp\n\t" 2415 "addq $256, %%rcx\n\t" 2416 "subl $256, %[len]\n\t" 2417 "movq %%rcx, 224(%[sha512])\n\t" 2418 2419 STORE_DIGEST() 2420 2421 "jnz 2b\n\t" 2422 2423 : 2424 : [mask] "m" (mBYTE_FLIP_MASK_Y), 2425 [len] "m" (len), 2426 [sha512] "r" (sha512), 2427 [K512] "m" (K512_AVX2), 2428 [K512_END] "m" (K512_AVX2_END) 2429 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" 2430 ); 1337 2431 1338 2432 return 0; 1339 2433 } 2434 2435 #ifdef HAVE_INTEL_RORX 2436 static int Transform_Sha512_AVX2_RORX(wc_Sha512* sha512) 2437 { 2438 __asm__ __volatile__ ( 2439 2440 /* 16 Ws plus loop counter. 
*/ 2441 "subq $136, %%rsp\n\t" 2442 "leaq 64(%[sha512]), " L2 "\n\t" 2443 2444 INIT_MASK(MASK_Y) 2445 LOAD_DIGEST() 2446 2447 LOAD_BLOCK_W_Y(MASK_Y, rcx) 2448 2449 "movl $4, 16*8(" WX ")\n\t" 2450 "leaq %[K512], %%rsi\n\t" 2451 /* b */ 2452 "movq %%r9, " L4 "\n\t" 2453 /* L3 = 0 (add to prev h) */ 2454 "xorq " L3 ", " L3 "\n\t" 2455 /* b ^ c */ 2456 "xorq %%r10, " L4 "\n\t" 2457 2458 SET_BLOCK_W_Y(rsi) 2459 2460 "# Start of 16 rounds\n" 2461 "1:\n\t" 2462 2463 "addq $128, %%rsi\n\t" 2464 2465 MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0) 2466 MsgSched4_AVX2_RORX_SET(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4) 2467 MsgSched4_AVX2_RORX_SET(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8) 2468 MsgSched4_AVX2_RORX_SET(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12) 2469 2470 "subl $1, 16*8(%%rsp)\n\t" 2471 "jnz 1b\n\t" 2472 2473 RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 0) 2474 RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD, 4) 2475 RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 8) 2476 RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD,12) 2477 /* Prev RND: h += Maj(a,b,c) */ 2478 "addq " L3 ", %%r8\n\t" 2479 "addq $136, %%rsp\n\t" 2480 2481 STORE_ADD_DIGEST() 2482 2483 : 2484 : [mask] "m" (mBYTE_FLIP_MASK_Y), 2485 [sha512] "r" (sha512), 2486 [K512] "m" (K512) 2487 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" 2488 ); 2489 2490 return 0; 2491 } 2492 2493 static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512* sha512, word32 len) 2494 { 2495 if ((len & WC_SHA512_BLOCK_SIZE) != 0) { 2496 XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE); 2497 Transform_Sha512_AVX2_RORX(sha512); 2498 sha512->data += WC_SHA512_BLOCK_SIZE; 2499 len -= WC_SHA512_BLOCK_SIZE; 2500 if (len == 0) 2501 return 0; 2502 } 2503 2504 __asm__ __volatile__ ( 2505 2506 "movq 224(%[sha512]), %%rax\n\t" 2507 2508 INIT_MASK(MASK_Y) 2509 LOAD_DIGEST() 2510 2511 "# Start of processing two blocks\n" 2512 "2:\n\t" 2513 2514 "subq $1344, %%rsp\n\t" 2515 "leaq %[K512], %%rsi\n\t" 2516 2517 /* L4 = b */ 2518 "movq %%r9, " L4 "\n\t" 2519 /* L3 = 0 (add to prev h) */ 2520 "xorq " L3 ", " L3 "\n\t" 2521 2522 LOAD_BLOCK2_W_Y(MASK_Y, rax) 2523 2524 /* L4 = b ^ c */ 2525 "xorq %%r10, " L4 "\n\t" 2526 "\n" 2527 "1:\n\t" 2528 SET_BLOCK2_W_Y(rsi) 2529 MsgSched2_AVX2_RORX(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0) 2530 MsgSched2_AVX2_RORX(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4) 2531 MsgSched2_AVX2_RORX(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8) 2532 MsgSched2_AVX2_RORX(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12) 2533 MsgSched2_AVX2_RORX(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16) 2534 MsgSched2_AVX2_RORX(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20) 2535 MsgSched2_AVX2_RORX(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24) 2536 MsgSched2_AVX2_RORX(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28) 2537 "addq $256, %%rsi\n\t" 2538 "addq $256, %%rsp\n\t" 2539 "cmpq %[K512_END], %%rsi\n\t" 2540 "jne 1b\n\t" 2541 2542 SET_BLOCK2_W_Y(rsi) 2543 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 2544 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4) 2545 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8) 2546 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12) 2547 2548 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16) 2549 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20) 2550 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24) 2551 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28) 2552 "addq " L3 ", %%r8\n\t" 2553 "subq $1024, %%rsp\n\t" 2554 2555 ADD_DIGEST() 2556 STORE_DIGEST() 2557 2558 /* L4 = b */ 2559 "movq %%r9, " L4 
"\n\t" 2560 /* L3 = 0 (add to prev h) */ 2561 "xorq " L3 ", " L3 "\n\t" 2562 /* L4 = b ^ c */ 2563 "xorq %%r10, " L4 "\n\t" 2564 2565 "movq $5, %%rsi\n\t" 2566 "\n" 2567 "3:\n\t" 2568 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2) 2569 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6) 2570 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10) 2571 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 2572 2573 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18) 2574 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22) 2575 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26) 2576 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30) 2577 "addq $256, %%rsp\n\t" 2578 "subq $1, %%rsi\n\t" 2579 "jnz 3b\n\t" 2580 2581 "addq " L3 ", %%r8\n\t" 2582 2583 ADD_DIGEST() 2584 2585 "movq 224(%[sha512]), %%rax\n\t" 2586 "addq $64, %%rsp\n\t" 2587 "addq $256, %%rax\n\t" 2588 "subl $256, %[len]\n\t" 2589 "movq %%rax, 224(%[sha512])\n\t" 2590 2591 STORE_DIGEST() 2592 2593 "jnz 2b\n\t" 2594 2595 : 2596 : [mask] "m" (mBYTE_FLIP_MASK_Y), 2597 [len] "m" (len), 2598 [sha512] "r" (sha512), 2599 [K512] "m" (K512_AVX2), 2600 [K512_END] "m" (K512_AVX2_END) 2601 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi" 2602 ); 2603 2604 return 0; 2605 } 2606 #endif /* HAVE_INTEL_RORX */ 1340 2607 #endif /* HAVE_INTEL_AVX2 */ 1341 2608 2609 #endif /* WOLFSSL_SHA512 */ 1342 2610 1343 2611 … … 1346 2614 /* -------------------------------------------------------------------------- */ 1347 2615 #ifdef WOLFSSL_SHA384 2616 2617 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH) 2618 /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */ 2619 #else 2620 1348 2621 static int InitSha384(wc_Sha384* sha384) 1349 2622 { … … 1386 2659 1387 2660 2661 int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash) 2662 { 2663 #ifdef LITTLE_ENDIAN_ORDER 2664 word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)]; 2665 #endif 2666 2667 if (sha384 == NULL || hash == NULL) { 2668 return BAD_FUNC_ARG; 2669 } 2670 2671 #ifdef LITTLE_ENDIAN_ORDER 2672 ByteReverseWords64((word64*)digest, (word64*)sha384->digest, 2673 WC_SHA384_DIGEST_SIZE); 2674 XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE); 2675 #else 2676 XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); 2677 #endif 2678 2679 return 0; 2680 } 2681 1388 2682 int wc_Sha384Final(wc_Sha384* sha384, byte* hash) 1389 2683 { … … 1412 2706 } 1413 2707 1414 1415 /* Hardware Acceleration */1416 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)1417 int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)1418 {1419 int ret = InitSha384(sha384);1420 1421 (void)heap;1422 (void)devId;1423 1424 Sha512_SetTransform();1425 1426 return ret;1427 }1428 #else1429 2708 int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId) 1430 2709 { … … 1439 2718 if (ret != 0) 1440 2719 return ret; 2720 2721 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 2722 Sha512_SetTransform(); 2723 #endif 2724 #ifdef WOLFSSL_SMALL_STACK_CACHE 2725 sha384->W = NULL; 2726 #endif 1441 2727 1442 2728 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384) … … 1449 2735 return ret; 1450 2736 } 1451 #endif 2737 2738 #endif /* WOLFSSL_IMX6_CAAM */ 1452 2739 1453 2740 int wc_InitSha384(wc_Sha384* sha384) … … 1461 2748 return; 1462 2749 2750 #ifdef WOLFSSL_SMALL_STACK_CACHE 2751 if (sha384->W != NULL) { 2752 XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 2753 sha384->W = NULL; 2754 } 2755 #endif 2756 1463 2757 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384) 1464 2758 wolfAsync_DevCtxFree(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384); … … 1470 2764 
#endif /* HAVE_FIPS */ 1471 2765 2766 #ifdef WOLFSSL_SHA512 1472 2767 1473 2768 int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash) … … 1482 2777 if (ret == 0) { 1483 2778 ret = wc_Sha512Final(&tmpSha512, hash); 2779 wc_Sha512Free(&tmpSha512); 1484 2780 } 1485 2781 return ret; … … 1494 2790 1495 2791 XMEMCPY(dst, src, sizeof(wc_Sha512)); 2792 #ifdef WOLFSSL_SMALL_STACK_CACHE 2793 dst->W = NULL; 2794 #endif 1496 2795 1497 2796 #ifdef WOLFSSL_ASYNC_CRYPT … … 1502 2801 } 1503 2802 2803 #endif /* WOLFSSL_SHA512 */ 2804 1504 2805 #ifdef WOLFSSL_SHA384 2806 1505 2807 int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash) 1506 2808 { … … 1514 2816 if (ret == 0) { 1515 2817 ret = wc_Sha384Final(&tmpSha384, hash); 2818 wc_Sha384Free(&tmpSha384); 1516 2819 } 1517 2820 return ret; … … 1525 2828 1526 2829 XMEMCPY(dst, src, sizeof(wc_Sha384)); 2830 #ifdef WOLFSSL_SMALL_STACK_CACHE 2831 dst->W = NULL; 2832 #endif 1527 2833 1528 2834 #ifdef WOLFSSL_ASYNC_CRYPT … … 1532 2838 return ret; 1533 2839 } 2840 1534 2841 #endif /* WOLFSSL_SHA384 */ 1535 2842 1536 #endif /* WOLFSSL_SHA512 */2843 #endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */
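The new Transform_Sha512_AVX1/AVX2 routines introduced above replace the old macro-driven intrinsic code with inline assembly that computes the SHA-512 message schedule two (xmm) or four (ymm) 64-bit words at a time. The recurrence they vectorize is the one spelled out in the W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) comments. For reference, a minimal scalar sketch of that schedule, assuming only the FIPS 180-4 definitions (the helper names below are illustrative and are not wolfSSL identifiers or part of this changeset):

    #include <stdint.h>

    /* Scalar reference for the step the MsgSched2/MsgSched4 macros compute per
     * vector lane.  The rotation/shift amounts follow FIPS 180-4 and match the
     * (1,63), (8,56), (19,45) and (61,3) shift pairs used in the assembly. */
    static uint64_t rotr64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    static uint64_t sigma0(uint64_t x)   /* s0: ROTR1 ^ ROTR8 ^ SHR7  */
    {
        return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7);
    }

    static uint64_t sigma1(uint64_t x)   /* s1: ROTR19 ^ ROTR61 ^ SHR6 */
    {
        return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6);
    }

    /* Expand the 16 big-endian message words of one block to all 80 words. */
    static void sha512_msg_schedule(uint64_t W[80])
    {
        int t;
        for (t = 16; t < 80; t++)
            W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
    }

The assembly variants interleave this expansion with the round computations (RND_* / RND_RORX_*) so that schedule and compression overlap instead of running as separate passes.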
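On the API side, the diff adds wc_Sha384FinalRaw(), which byte-reverses the current internal digest on little-endian builds and copies it out without the final padding that wc_Sha384Final() performs, and it extends the init/copy/free paths to manage the optional WOLFSSL_SMALL_STACK_CACHE W buffer. A typical one-shot SHA-384 computation with this API is sketched below; wc_Sha384Update() is assumed to keep its usual (context, data, length) signature, and hash_msg is a hypothetical helper, not part of the changeset:

    #include <wolfssl/wolfcrypt/sha512.h>

    /* Hypothetical caller: hash len bytes of data into a 48-byte SHA-384 digest. */
    static int hash_msg(const byte* data, word32 len,
                        byte out[WC_SHA384_DIGEST_SIZE])
    {
        wc_Sha384 sha384;
        int ret = wc_InitSha384(&sha384);  /* selects the AVX transform when compiled in */
        if (ret == 0)
            ret = wc_Sha384Update(&sha384, data, len);
        if (ret == 0)
            ret = wc_Sha384Final(&sha384, out);  /* pads, appends the length, writes 48 bytes */
        wc_Sha384Free(&sha384);
        return ret;
    }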