source: azure_iot_hub/trunk/wolfssl-3.15.7/wolfcrypt/src/sha512.c@389

Last change on this file since 389 was 389, checked in by coas-nagasima, 5 years ago

Updated so that the build passes

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 106.1 KB
1/* sha512.c
2 *
3 * Copyright (C) 2006-2017 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL.
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20 */
21
22
23#ifdef HAVE_CONFIG_H
24 #include <config.h>
25#endif
26
27#include <wolfssl/wolfcrypt/settings.h>
28
29#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
30
31#if defined(HAVE_FIPS) && \
32 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2)
33
34 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
35 #define FIPS_NO_WRAPPERS
36
37 #ifdef USE_WINDOWS_API
38 #pragma code_seg(".fipsA$k")
39 #pragma const_seg(".fipsB$k")
40 #endif
41#endif
42
43#include <wolfssl/wolfcrypt/sha512.h>
44#include <wolfssl/wolfcrypt/error-crypt.h>
45#include <wolfssl/wolfcrypt/cpuid.h>
46
47/* deprecated USE_SLOW_SHA2 (replaced with USE_SLOW_SHA512) */
48#if defined(USE_SLOW_SHA2) && !defined(USE_SLOW_SHA512)
49 #define USE_SLOW_SHA512
50#endif
51
52/* fips wrapper calls, user can call direct */
53#if defined(HAVE_FIPS) && \
54 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))
55
56 #ifdef WOLFSSL_SHA512
57
58 int wc_InitSha512(wc_Sha512* sha)
59 {
60 if (sha == NULL) {
61 return BAD_FUNC_ARG;
62 }
63
64 return InitSha512_fips(sha);
65 }
66 int wc_InitSha512_ex(wc_Sha512* sha, void* heap, int devId)
67 {
68 (void)heap;
69 (void)devId;
70 if (sha == NULL) {
71 return BAD_FUNC_ARG;
72 }
73 return InitSha512_fips(sha);
74 }
75 int wc_Sha512Update(wc_Sha512* sha, const byte* data, word32 len)
76 {
77 if (sha == NULL || (data == NULL && len > 0)) {
78 return BAD_FUNC_ARG;
79 }
80
81 return Sha512Update_fips(sha, data, len);
82 }
83 int wc_Sha512Final(wc_Sha512* sha, byte* out)
84 {
85 if (sha == NULL || out == NULL) {
86 return BAD_FUNC_ARG;
87 }
88
89 return Sha512Final_fips(sha, out);
90 }
91 void wc_Sha512Free(wc_Sha512* sha)
92 {
93 (void)sha;
94 /* Not supported in FIPS */
95 }
96 #endif
97
98 #if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM)
99 int wc_InitSha384(wc_Sha384* sha)
100 {
101 if (sha == NULL) {
102 return BAD_FUNC_ARG;
103 }
104 return InitSha384_fips(sha);
105 }
106 int wc_InitSha384_ex(wc_Sha384* sha, void* heap, int devId)
107 {
108 (void)heap;
109 (void)devId;
110 if (sha == NULL) {
111 return BAD_FUNC_ARG;
112 }
113 return InitSha384_fips(sha);
114 }
115 int wc_Sha384Update(wc_Sha384* sha, const byte* data, word32 len)
116 {
117 if (sha == NULL || (data == NULL && len > 0)) {
118 return BAD_FUNC_ARG;
119 }
120 return Sha384Update_fips(sha, data, len);
121 }
122 int wc_Sha384Final(wc_Sha384* sha, byte* out)
123 {
124 if (sha == NULL || out == NULL) {
125 return BAD_FUNC_ARG;
126 }
127 return Sha384Final_fips(sha, out);
128 }
129 void wc_Sha384Free(wc_Sha384* sha)
130 {
131 (void)sha;
132 /* Not supported in FIPS */
133 }
134 #endif /* WOLFSSL_SHA384 || HAVE_AESGCM */
135
136#else /* else build without fips, or for FIPS v2 */
137
138#include <wolfssl/wolfcrypt/logging.h>
139
140#ifdef NO_INLINE
141 #include <wolfssl/wolfcrypt/misc.h>
142#else
143 #define WOLFSSL_MISC_INCLUDED
144 #include <wolfcrypt/src/misc.c>
145#endif
146
147
148#if defined(USE_INTEL_SPEEDUP)
149 #if defined(__GNUC__) && ((__GNUC__ < 4) || \
150 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
151 #undef NO_AVX2_SUPPORT
152 #define NO_AVX2_SUPPORT
153 #endif
154 #if defined(__clang__) && ((__clang_major__ < 3) || \
155 (__clang_major__ == 3 && __clang_minor__ <= 5))
156 #define NO_AVX2_SUPPORT
157 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
158 #undef NO_AVX2_SUPPORT
159 #endif
160
161 #define HAVE_INTEL_AVX1
162 #ifndef NO_AVX2_SUPPORT
163 #define HAVE_INTEL_AVX2
164#endif
165#endif
166
167#if defined(HAVE_INTEL_AVX1)
168 /* #define DEBUG_XMM */
169#endif
170
171#if defined(HAVE_INTEL_AVX2)
172 #define HAVE_INTEL_RORX
173 /* #define DEBUG_YMM */
174#endif
175
176#if defined(HAVE_BYTEREVERSE64) && \
177 !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
178 #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size)
179 #define ByteReverseWords64_1(buf, size) \
180 { unsigned int i ;\
181 for(i=0; i< size/sizeof(word64); i++){\
182 __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\
183 }\
184 }
185#endif
186
187#if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
188 /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
189#else
190
191#ifdef WOLFSSL_SHA512
192
193static int InitSha512(wc_Sha512* sha512)
194{
195 if (sha512 == NULL)
196 return BAD_FUNC_ARG;
197
198 sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
199 sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
200 sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
201 sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
202 sha512->digest[4] = W64LIT(0x510e527fade682d1);
203 sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
204 sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
205 sha512->digest[7] = W64LIT(0x5be0cd19137e2179);
206
207 sha512->buffLen = 0;
208 sha512->loLen = 0;
209 sha512->hiLen = 0;
210
211 return 0;
212}
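/* Editorial note: the eight initial digest words above are the SHA-512 IV
 * from FIPS 180-4 section 5.3.5: the first 64 bits of the fractional parts
 * of the square roots of the first eight primes, e.g.
 * frac(sqrt(2)) * 2^64 is approximately 0x6a09e667f3bcc908. */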
213
214#endif /* WOLFSSL_SHA512 */
215
216/* Hardware Acceleration */
217#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
218
219#ifdef WOLFSSL_SHA512
220
221 /*****
222 Intel AVX1/AVX2 Macro Control Structure
223
 224 #if defined(HAVE_INTEL_SPEEDUP)
225 #define HAVE_INTEL_AVX1
226 #define HAVE_INTEL_AVX2
227 #endif
228
229 int InitSha512(wc_Sha512* sha512) {
230 Save/Recover XMM, YMM
231 ...
232
233 Check Intel AVX cpuid flags
234 }
235
236 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
237 Transform_Sha512_AVX1(); # Function prototype
238 Transform_Sha512_AVX2(); #
239 #endif
240
241 _Transform_Sha512() { # Native Transform Function body
242
243 }
244
245 int Sha512Update() {
246 Save/Recover XMM, YMM
247 ...
248 }
249
250 int Sha512Final() {
251 Save/Recover XMM, YMM
252 ...
253 }
254
255
256 #if defined(HAVE_INTEL_AVX1)
257
258 XMM Instructions/INLINE asm Definitions
259
260 #endif
261
262 #if defined(HAVE_INTEL_AVX2)
263
264 YMM Instructions/INLINE asm Definitions
265
266 #endif
267
 268 #if defined(HAVE_INTEL_AVX1)
269
270 int Transform_Sha512_AVX1() {
271 Stitched Message Sched/Round
272 }
273
274 #endif
275
 276 #if defined(HAVE_INTEL_AVX2)
277
278 int Transform_Sha512_AVX2() {
279 Stitched Message Sched/Round
280 }
281 #endif
282
283 */
284
285
 286 /* Each platform needs to query info type 1 from cpuid to see if AVX/AVX2 is
 287 * supported. Also, let's set up a macro for proper linkage w/o ABI conflicts
288 */
289
290 #if defined(HAVE_INTEL_AVX1)
291 static int Transform_Sha512_AVX1(wc_Sha512 *sha512);
292 static int Transform_Sha512_AVX1_Len(wc_Sha512 *sha512, word32 len);
293 #endif
294 #if defined(HAVE_INTEL_AVX2)
295 static int Transform_Sha512_AVX2(wc_Sha512 *sha512);
296 static int Transform_Sha512_AVX2_Len(wc_Sha512 *sha512, word32 len);
297 #if defined(HAVE_INTEL_RORX)
298 static int Transform_Sha512_AVX1_RORX(wc_Sha512 *sha512);
299 static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512 *sha512,
300 word32 len);
301 static int Transform_Sha512_AVX2_RORX(wc_Sha512 *sha512);
302 static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512 *sha512,
303 word32 len);
304 #endif
305 #endif
306 static int _Transform_Sha512(wc_Sha512 *sha512);
307 static int (*Transform_Sha512_p)(wc_Sha512* sha512) = _Transform_Sha512;
308 static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL;
309 static int transform_check = 0;
310 static int intel_flags;
311 #define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512)
312 #define Transform_Sha512_Len(sha512, len) \
313 (*Transform_Sha512_Len_p)(sha512, len)
314
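    /* Editorial note: one-time selection of the block transform based on the
     * cpuid flags. Preference order: AVX2 with BMI2/RORX, then plain AVX2,
     * then AVX1, otherwise the portable C _Transform_Sha512. */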
315 static void Sha512_SetTransform()
316 {
317 if (transform_check)
318 return;
319
320 intel_flags = cpuid_get_flags();
321
322 #if defined(HAVE_INTEL_AVX2)
323 if (IS_INTEL_AVX2(intel_flags)) {
324 #ifdef HAVE_INTEL_RORX
325 if (IS_INTEL_BMI2(intel_flags)) {
326 Transform_Sha512_p = Transform_Sha512_AVX2_RORX;
327 Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len;
328 }
329 else
330 #endif
331 if (1) {
332 Transform_Sha512_p = Transform_Sha512_AVX2;
333 Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len;
334 }
335 #ifdef HAVE_INTEL_RORX
336 else {
337 Transform_Sha512_p = Transform_Sha512_AVX1_RORX;
338 Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len;
339 }
340 #endif
341 }
342 else
343 #endif
344 #if defined(HAVE_INTEL_AVX1)
345 if (IS_INTEL_AVX1(intel_flags)) {
346 Transform_Sha512_p = Transform_Sha512_AVX1;
347 Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len;
348 }
349 else
350 #endif
351 Transform_Sha512_p = _Transform_Sha512;
352
353 transform_check = 1;
354 }
355#endif /* WOLFSSL_SHA512 */
356
357#else
358 #define Transform_Sha512(sha512) _Transform_Sha512(sha512)
359
360#endif
361
362#ifdef WOLFSSL_SHA512
363
364 int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
365 {
366 int ret = 0;
367
368 if (sha512 == NULL)
369 return BAD_FUNC_ARG;
370
371 sha512->heap = heap;
372
373 ret = InitSha512(sha512);
374 if (ret != 0)
375 return ret;
376
377#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
378 Sha512_SetTransform();
379#endif
380
381#ifdef WOLFSSL_SMALL_STACK_CACHE
382 sha512->W = NULL;
383#endif
384
385 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
386 ret = wolfAsync_DevCtxInit(&sha512->asyncDev,
387 WOLFSSL_ASYNC_MARKER_SHA512, sha512->heap, devId);
388 #else
389 (void)devId;
390 #endif /* WOLFSSL_ASYNC_CRYPT */
391
392 return ret;
393 }
394
395#endif /* WOLFSSL_SHA512 */
396
397
398static const word64 K512[80] = {
399 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
400 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
401 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
402 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
403 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
404 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
405 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
406 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
407 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
408 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
409 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
410 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
411 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
412 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
413 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
414 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
415 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
416 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
417 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
418 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
419 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
420 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
421 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
422 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
423 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
424 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
425 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
426 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
427 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
428 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
429 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
430 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
431 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
432 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
433 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
434 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
435 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
436 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
437 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
438 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
439};
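/* Editorial note: K512 holds the 80 SHA-512 round constants from FIPS 180-4
 * section 4.2.3, the first 64 bits of the fractional parts of the cube roots
 * of the first 80 primes. */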
440
441#define blk0(i) (W[i] = sha512->buffer[i])
442
443#define blk2(i) (\
444 W[ i & 15] += \
445 s1(W[(i-2) & 15])+ \
446 W[(i-7) & 15] + \
447 s0(W[(i-15) & 15]) \
448 )
449
450#define Ch(x,y,z) (z^(x&(y^z)))
451#define Maj(x,y,z) ((x&y)|(z&(x|y)))
452
453#define a(i) T[(0-i)&7]
454#define b(i) T[(1-i)&7]
455#define c(i) T[(2-i)&7]
456#define d(i) T[(3-i)&7]
457#define e(i) T[(4-i)&7]
458#define f(i) T[(5-i)&7]
459#define g(i) T[(6-i)&7]
460#define h(i) T[(7-i)&7]
461
462#define S0(x) (rotrFixed64(x,28)^rotrFixed64(x,34)^rotrFixed64(x,39))
463#define S1(x) (rotrFixed64(x,14)^rotrFixed64(x,18)^rotrFixed64(x,41))
464#define s0(x) (rotrFixed64(x,1)^rotrFixed64(x,8)^(x>>7))
465#define s1(x) (rotrFixed64(x,19)^rotrFixed64(x,61)^(x>>6))
466
467#define R(i) \
468 h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j ? blk2(i) : blk0(i)); \
469 d(i) += h(i); \
470 h(i) += S0(a(i)) + Maj(a(i),b(i),c(i))
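/* Editorial note: R(i) is one SHA-512 round in the two-step form of
 * FIPS 180-4:
 *   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
 *   T2 = Sigma0(a) + Maj(a,b,c)
 *   d += T1;  h = T1 + T2
 * The a(i)..h(i) index macros rotate the working variables in place rather
 * than copying them, and blk2(i) updates the 16-word rolling schedule. */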
471
472static int _Transform_Sha512(wc_Sha512* sha512)
473{
474 const word64* K = K512;
475 word32 j;
476 word64 T[8];
477
478#ifdef WOLFSSL_SMALL_STACK_CACHE
479 word64* W = sha512->W;
480 if (W == NULL) {
481 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL,
482 DYNAMIC_TYPE_TMP_BUFFER);
483 if (W == NULL)
484 return MEMORY_E;
485 sha512->W = W;
486 }
487#elif defined(WOLFSSL_SMALL_STACK)
488 word64* W;
489 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
490 if (W == NULL)
491 return MEMORY_E;
492#else
493 word64 W[16];
494#endif
495
496 /* Copy digest to working vars */
497 XMEMCPY(T, sha512->digest, sizeof(T));
498
499#ifdef USE_SLOW_SHA512
 500 /* roughly half the code size, but 50% slower */
501 /* 80 operations, not unrolled */
502 for (j = 0; j < 80; j += 16) {
503 int m;
504 for (m = 0; m < 16; m++) { /* braces needed here for macros {} */
505 R(m);
506 }
507 }
508#else
509 /* 80 operations, partially loop unrolled */
510 for (j = 0; j < 80; j += 16) {
511 R( 0); R( 1); R( 2); R( 3);
512 R( 4); R( 5); R( 6); R( 7);
513 R( 8); R( 9); R(10); R(11);
514 R(12); R(13); R(14); R(15);
515 }
516#endif /* USE_SLOW_SHA512 */
517
518 /* Add the working vars back into digest */
519 sha512->digest[0] += a(0);
520 sha512->digest[1] += b(0);
521 sha512->digest[2] += c(0);
522 sha512->digest[3] += d(0);
523 sha512->digest[4] += e(0);
524 sha512->digest[5] += f(0);
525 sha512->digest[6] += g(0);
526 sha512->digest[7] += h(0);
527
528 /* Wipe variables */
529 ForceZero(W, sizeof(word64) * 16);
530 ForceZero(T, sizeof(T));
531
532#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
533 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
534#endif
535
536 return 0;
537}
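/* Editorial note: _Transform_Sha512() compresses exactly one 128-byte block
 * from sha512->buffer (already byte-reversed to big-endian words by the
 * caller on little-endian targets) into sha512->digest. W[] needs only 16
 * words because blk2() indexes it modulo 16 as a rolling message schedule. */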
538
539
540static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len)
541{
542 word64 tmp = sha512->loLen;
543 if ( (sha512->loLen += len) < tmp)
544 sha512->hiLen++; /* carry low to high */
545}
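/* Editorial note: loLen/hiLen together form a 128-bit running byte count.
 * The wrap test above detects unsigned overflow of the low word; for example
 * adding 0x10 to loLen == 0xFFFFFFFFFFFFFFF8 wraps it to 0x08, which is less
 * than the old value, so hiLen is incremented as the carry. */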
546
547static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
548{
549 int ret = 0;
550 /* do block size increments */
551 byte* local = (byte*)sha512->buffer;
552
553 /* check that internal buffLen is valid */
554 if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE)
555 return BUFFER_E;
556
557 if (sha512->buffLen > 0) {
558 word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
559 if (add > 0) {
560 XMEMCPY(&local[sha512->buffLen], data, add);
561
562 sha512->buffLen += add;
563 data += add;
564 len -= add;
565 }
566
567 if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) {
568 #if defined(LITTLE_ENDIAN_ORDER)
569 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
570 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
571 #endif
572 {
573 ByteReverseWords64(sha512->buffer, sha512->buffer,
574 WC_SHA512_BLOCK_SIZE);
575 }
576 #endif
577 ret = Transform_Sha512(sha512);
578 if (ret == 0) {
579 AddLength(sha512, WC_SHA512_BLOCK_SIZE);
580 sha512->buffLen = 0;
581 }
582 else
583 len = 0;
584 }
585 }
586
587#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
588 if (Transform_Sha512_Len_p != NULL) {
589 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);
590
591 if (blocksLen > 0) {
592 AddLength(sha512, blocksLen);
593 sha512->data = data;
594 /* Byte reversal performed in function if required. */
595 Transform_Sha512_Len(sha512, blocksLen);
596 data += blocksLen;
597 len -= blocksLen;
598 }
599 }
600 else
601#endif
602#if !defined(LITTLE_ENDIAN_ORDER) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
603 {
604 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);
605
606 AddLength(sha512, blocksLen);
607 while (len >= WC_SHA512_BLOCK_SIZE) {
608 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);
609
610 data += WC_SHA512_BLOCK_SIZE;
611 len -= WC_SHA512_BLOCK_SIZE;
612
613 /* Byte reversal performed in function if required. */
614 ret = Transform_Sha512(sha512);
615 if (ret != 0)
616 break;
617 }
618 }
619#else
620 {
621 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);
622
623 AddLength(sha512, blocksLen);
624 while (len >= WC_SHA512_BLOCK_SIZE) {
625 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);
626
627 data += WC_SHA512_BLOCK_SIZE;
628 len -= WC_SHA512_BLOCK_SIZE;
629
630 ByteReverseWords64(sha512->buffer, sha512->buffer,
631 WC_SHA512_BLOCK_SIZE);
632 ret = Transform_Sha512(sha512);
633 if (ret != 0)
634 break;
635 }
636 }
637#endif
638
639 if (len > 0) {
640 XMEMCPY(local, data, len);
641 sha512->buffLen = len;
642 }
643
644 return ret;
645}
646
647#ifdef WOLFSSL_SHA512
648
649int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
650{
651 if (sha512 == NULL || (data == NULL && len > 0)) {
652 return BAD_FUNC_ARG;
653 }
654
655#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
656 if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
657 #if defined(HAVE_INTEL_QA)
658 return IntelQaSymSha512(&sha512->asyncDev, NULL, data, len);
659 #endif
660 }
661#endif /* WOLFSSL_ASYNC_CRYPT */
662
663 return Sha512Update(sha512, data, len);
664}
665
666#endif /* WOLFSSL_SHA512 */
667
668#endif /* WOLFSSL_IMX6_CAAM */
669
670static WC_INLINE int Sha512Final(wc_Sha512* sha512)
671{
672 byte* local = (byte*)sha512->buffer;
673 int ret;
674
675 if (sha512 == NULL) {
676 return BAD_FUNC_ARG;
677 }
678
679 AddLength(sha512, sha512->buffLen); /* before adding pads */
680
681 local[sha512->buffLen++] = 0x80; /* add 1 */
682
683 /* pad with zeros */
684 if (sha512->buffLen > WC_SHA512_PAD_SIZE) {
685 XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
686 sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen;
687#if defined(LITTLE_ENDIAN_ORDER)
688 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
689 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
690 #endif
691 {
692 ByteReverseWords64(sha512->buffer,sha512->buffer,
693 WC_SHA512_BLOCK_SIZE);
694 }
695#endif /* LITTLE_ENDIAN_ORDER */
696 ret = Transform_Sha512(sha512);
697 if (ret != 0)
698 return ret;
699
700 sha512->buffLen = 0;
701 }
702 XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen);
703
704 /* put lengths in bits */
705 sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) +
706 (sha512->hiLen << 3);
707 sha512->loLen = sha512->loLen << 3;
708
709 /* store lengths */
710#if defined(LITTLE_ENDIAN_ORDER)
711 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
712 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
713 #endif
714 ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_PAD_SIZE);
715#endif
716 /* ! length ordering dependent on digest endian type ! */
717
718 sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
719 sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
720#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
721 if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
722 ByteReverseWords64(&(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
723 &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
724 WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE);
725#endif
726 ret = Transform_Sha512(sha512);
727 if (ret != 0)
728 return ret;
729
730 #ifdef LITTLE_ENDIAN_ORDER
731 ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE);
732 #endif
733
734 return 0;
735}
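/* Editorial example: for an empty message the padding above yields a single
 * block of 0x80, 111 zero bytes, and a 16-byte big-endian bit length of 0;
 * hashing it gives the well-known SHA-512("") digest beginning
 * cf83e1357eefb8bd... */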
736
737#ifdef WOLFSSL_SHA512
738
739int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash)
740{
741#ifdef LITTLE_ENDIAN_ORDER
742 word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)];
743#endif
744
745 if (sha512 == NULL || hash == NULL) {
746 return BAD_FUNC_ARG;
747 }
748
749#ifdef LITTLE_ENDIAN_ORDER
750 ByteReverseWords64((word64*)digest, (word64*)sha512->digest,
751 WC_SHA512_DIGEST_SIZE);
752 XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE);
753#else
754 XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
755#endif
756
757 return 0;
758}
759
760int wc_Sha512Final(wc_Sha512* sha512, byte* hash)
761{
762 int ret;
763
764 if (sha512 == NULL || hash == NULL) {
765 return BAD_FUNC_ARG;
766 }
767
768#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
769 if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
770 #if defined(HAVE_INTEL_QA)
771 return IntelQaSymSha512(&sha512->asyncDev, hash, NULL,
772 WC_SHA512_DIGEST_SIZE);
773 #endif
774 }
775#endif /* WOLFSSL_ASYNC_CRYPT */
776
777 ret = Sha512Final(sha512);
778 if (ret != 0)
779 return ret;
780
781 XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
782
783 return InitSha512(sha512); /* reset state */
784}
785
786
787int wc_InitSha512(wc_Sha512* sha512)
788{
789 return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID);
790}
791
792void wc_Sha512Free(wc_Sha512* sha512)
793{
794 if (sha512 == NULL)
795 return;
796
797#ifdef WOLFSSL_SMALL_STACK_CACHE
798 if (sha512->W != NULL) {
799 XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
800 sha512->W = NULL;
801 }
802#endif
803
804#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
805 wolfAsync_DevCtxFree(&sha512->asyncDev, WOLFSSL_ASYNC_MARKER_SHA512);
806#endif /* WOLFSSL_ASYNC_CRYPT */
807}
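/* Editorial usage sketch (not part of the original wolfSSL source): the
 * public streaming API defined above is typically driven as follows; the
 * message and variable names here are hypothetical and error handling is
 * abbreviated. wc_Sha512Final() also re-initializes the state on success.
 *
 *   byte digest[WC_SHA512_DIGEST_SIZE];
 *   wc_Sha512 sha;
 *
 *   if (wc_InitSha512(&sha) == 0) {
 *       wc_Sha512Update(&sha, (const byte*)"abc", 3);
 *       wc_Sha512Final(&sha, digest);
 *       wc_Sha512Free(&sha);
 *   }
 */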
808
809
810#if defined(HAVE_INTEL_AVX1)
811
812static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f };
813
814#define W_0 xmm0
815#define W_2 xmm1
816#define W_4 xmm2
817#define W_6 xmm3
818#define W_8 xmm4
819#define W_10 xmm5
820#define W_12 xmm6
821#define W_14 xmm7
822
823#define W_M15 xmm12
824#define W_M7 xmm13
825#define MASK xmm14
826
827#define XTMP1 xmm8
828#define XTMP2 xmm9
829#define XTMP3 xmm10
830#define XTMP4 xmm11
831
832#define XMM_REGS \
833 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
834 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
835
836#define _VPALIGNR(dest, src1, src2, bits) \
837 "vpalignr $" #bits ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
838#define VPALIGNR(dest, src1, src2, bits) \
839 _VPALIGNR(dest, src1, src2, bits)
840
841#define _V_SHIFT_R(dest, src, bits) \
842 "vpsrlq $" #bits ", %%" #src ", %%" #dest "\n\t"
843#define V_SHIFT_R(dest, src, bits) \
844 _V_SHIFT_R(dest, src, bits)
845
846#define _V_SHIFT_L(dest, src, bits) \
847 "vpsllq $" #bits ", %%" #src ", %%" #dest "\n\t"
848#define V_SHIFT_L(dest, src, bits) \
849 _V_SHIFT_L(dest, src, bits)
850
851#define _V_ADD(dest, src1, src2) \
852 "vpaddq %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
853#define V_ADD(dest, src1, src2) \
854 _V_ADD(dest, src1, src2)
855
856#define _V_XOR(dest, src1, src2) \
857 "vpxor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
858#define V_XOR(dest, src1, src2) \
859 _V_XOR(dest, src1, src2)
860
861#define _V_OR(dest, src1, src2) \
862 "vpor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
863#define V_OR(dest, src1, src2) \
864 _V_OR(dest, src1, src2)
865
866#define RA %%r8
867#define RB %%r9
868#define RC %%r10
869#define RD %%r11
870#define RE %%r12
871#define RF %%r13
872#define RG %%r14
873#define RH %%r15
874
875#define STATE_REGS "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
876
877#define L1 "%%rax"
878#define L2 "%%rcx"
879#define L3 "%%rdx"
880#define L4 "%%rbx"
881#define WX "%%rsp"
882
883#define WORK_REGS "rax", "rbx", "rcx", "rdx"
884
885#define RND_0_1(a,b,c,d,e,f,g,h,i) \
886 /* L1 = e >>> 23 */ \
887 "rorq $23, " L1 "\n\t" \
888
889#define RND_0_2(a,b,c,d,e,f,g,h,i) \
890 /* L3 = a */ \
891 "movq "#a", " L3 "\n\t" \
892 /* L2 = f */ \
893 "movq "#f", " L2 "\n\t" \
894 /* h += W_X[i] */ \
895 "addq ("#i")*8(" WX "), "#h"\n\t" \
896 /* L2 = f ^ g */ \
897 "xorq "#g", " L2 "\n\t" \
898
899#define RND_0_2_A(a,b,c,d,e,f,g,h,i) \
900 /* L3 = a */ \
901 "movq "#a", " L3 "\n\t" \
902 /* L2 = f */ \
903 "movq "#f", " L2 "\n\t" \
904
905#define RND_0_2_B(a,b,c,d,e,f,g,h,i) \
906 /* h += W_X[i] */ \
907 "addq ("#i")*8(" WX "), "#h"\n\t" \
908 /* L2 = f ^ g */ \
909 "xorq "#g", " L2 "\n\t" \
910
911#define RND_0_3(a,b,c,d,e,f,g,h,i) \
912 /* L1 = (e >>> 23) ^ e */ \
913 "xorq "#e", " L1 "\n\t" \
914 /* L2 = (f ^ g) & e */ \
915 "andq "#e", " L2 "\n\t" \
916
917#define RND_0_4(a,b,c,d,e,f,g,h,i) \
918 /* L1 = ((e >>> 23) ^ e) >>> 4 */ \
919 "rorq $4, " L1 "\n\t" \
920 /* L2 = ((f ^ g) & e) ^ g */ \
921 "xorq "#g", " L2 "\n\t" \
922
923#define RND_0_5(a,b,c,d,e,f,g,h,i) \
924 /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */ \
925 "xorq "#e", " L1 "\n\t" \
926 /* h += Ch(e,f,g) */ \
927 "addq " L2 ", "#h"\n\t" \
928
929#define RND_0_6(a,b,c,d,e,f,g,h,i) \
930 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \
931 "rorq $14, " L1 "\n\t" \
932 /* L3 = a ^ b */ \
933 "xorq "#b", " L3 "\n\t" \
934
935#define RND_0_7(a,b,c,d,e,f,g,h,i) \
936 /* h += Sigma1(e) */ \
937 "addq " L1 ", "#h"\n\t" \
938 /* L2 = a */ \
939 "movq "#a", " L2 "\n\t" \
940
941#define RND_0_8(a,b,c,d,e,f,g,h,i) \
942 /* L4 = (a ^ b) & (b ^ c) */ \
943 "andq " L3 ", " L4 "\n\t" \
944 /* L2 = a >>> 5 */ \
945 "rorq $5, " L2 "\n\t" \
946
947#define RND_0_9(a,b,c,d,e,f,g,h,i) \
948 /* L2 = (a >>> 5) ^ a */ \
949 "xorq "#a", " L2 "\n\t" \
 950 /* L4 = ((a ^ b) & (b ^ c)) ^ b */ \
951 "xorq "#b", " L4 "\n\t" \
952
953#define RND_0_10(a,b,c,d,e,f,g,h,i) \
954 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \
955 "rorq $6, " L2 "\n\t" \
956 /* d += h */ \
957 "addq "#h", "#d"\n\t" \
958
959#define RND_0_11(a,b,c,d,e,f,g,h,i) \
960 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \
961 "xorq "#a", " L2 "\n\t" \
962 /* h += Sigma0(a) */ \
963 "addq " L4 ", "#h"\n\t" \
964
965#define RND_0_12(a,b,c,d,e,f,g,h,i) \
966 /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \
967 "rorq $28, " L2 "\n\t" \
968 /* d (= e next RND) */ \
969 "movq "#d", " L1 "\n\t" \
970 /* h += Maj(a,b,c) */ \
971 "addq " L2 ", "#h"\n\t" \
972
973#define RND_1_1(a,b,c,d,e,f,g,h,i) \
974 /* L1 = e >>> 23 */ \
975 "rorq $23, " L1 "\n\t" \
976
977#define RND_1_2(a,b,c,d,e,f,g,h,i) \
978 /* L4 = a */ \
979 "movq "#a", " L4 "\n\t" \
980 /* L2 = f */ \
981 "movq "#f", " L2 "\n\t" \
982 /* h += W_X[i] */ \
983 "addq ("#i")*8(" WX "), "#h"\n\t" \
984 /* L2 = f ^ g */ \
985 "xorq "#g", " L2 "\n\t" \
986
987#define RND_1_2_A(a,b,c,d,e,f,g,h,i) \
988 /* L4 = a */ \
989 "movq "#a", " L4 "\n\t" \
990 /* L2 = f */ \
991 "movq "#f", " L2 "\n\t" \
992
993#define RND_1_2_B(a,b,c,d,e,f,g,h,i) \
994 /* h += W_X[i] */ \
995 "addq ("#i")*8(" WX "), "#h"\n\t" \
996 /* L2 = f ^ g */ \
997 "xorq "#g", " L2 "\n\t" \
998
999#define RND_1_3(a,b,c,d,e,f,g,h,i) \
1000 /* L1 = (e >>> 23) ^ e */ \
1001 "xorq "#e", " L1 "\n\t" \
1002 /* L2 = (f ^ g) & e */ \
1003 "andq "#e", " L2 "\n\t" \
1004
1005#define RND_1_4(a,b,c,d,e,f,g,h,i) \
1006 /* ((e >>> 23) ^ e) >>> 4 */ \
1007 "rorq $4, " L1 "\n\t" \
1008 /* ((f ^ g) & e) ^ g */ \
1009 "xorq "#g", " L2 "\n\t" \
1010
1011#define RND_1_5(a,b,c,d,e,f,g,h,i) \
1012 /* (((e >>> 23) ^ e) >>> 4) ^ e */ \
1013 "xorq "#e", " L1 "\n\t" \
1014 /* h += Ch(e,f,g) */ \
1015 "addq " L2 ", "#h"\n\t" \
1016
1017#define RND_1_6(a,b,c,d,e,f,g,h,i) \
1018 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \
1019 "rorq $14, " L1 "\n\t" \
1020 /* L4 = a ^ b */ \
1021 "xorq "#b", " L4 "\n\t" \
1022
1023#define RND_1_7(a,b,c,d,e,f,g,h,i) \
1024 /* h += Sigma1(e) */ \
1025 "addq " L1 ", "#h"\n\t" \
1026 /* L2 = a */ \
1027 "movq "#a", " L2 "\n\t" \
1028
1029#define RND_1_8(a,b,c,d,e,f,g,h,i) \
1030 /* L3 = (a ^ b) & (b ^ c) */ \
1031 "andq " L4 ", " L3 "\n\t" \
1032 /* L2 = a >>> 5 */ \
1033 "rorq $5, " L2 "\n\t" \
1034
1035#define RND_1_9(a,b,c,d,e,f,g,h,i) \
1036 /* L2 = (a >>> 5) ^ a */ \
1037 "xorq "#a", " L2 "\n\t" \
 1039 /* L3 = ((a ^ b) & (b ^ c)) ^ b */ \
1039 "xorq "#b", " L3 "\n\t" \
1040
1041#define RND_1_10(a,b,c,d,e,f,g,h,i) \
1042 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \
1043 "rorq $6, " L2 "\n\t" \
1044 /* d += h */ \
1045 "addq "#h", "#d"\n\t" \
1046
1047#define RND_1_11(a,b,c,d,e,f,g,h,i) \
1048 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \
1049 "xorq "#a", " L2 "\n\t" \
1050 /* h += Sigma0(a) */ \
1051 "addq " L3 ", "#h"\n\t" \
1052
1053#define RND_1_12(a,b,c,d,e,f,g,h,i) \
1054 /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \
1055 "rorq $28, " L2 "\n\t" \
1056 /* d (= e next RND) */ \
1057 "movq "#d", " L1 "\n\t" \
1058 /* h += Maj(a,b,c) */ \
1059 "addq " L2 ", "#h"\n\t" \
1060
1061
1062#define MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
1063 RND_0_1(a,b,c,d,e,f,g,h,i) \
1064 VPALIGNR(W_M15, W_2, W_0, 8) \
1065 VPALIGNR(W_M7, W_10, W_8, 8) \
1066 RND_0_2(a,b,c,d,e,f,g,h,i) \
1067 V_SHIFT_R(XTMP1, W_M15, 1) \
1068 V_SHIFT_L(XTMP2, W_M15, 63) \
1069 RND_0_3(a,b,c,d,e,f,g,h,i) \
1070 RND_0_4(a,b,c,d,e,f,g,h,i) \
1071 V_SHIFT_R(XTMP3, W_M15, 8) \
1072 V_SHIFT_L(XTMP4, W_M15, 56) \
1073 RND_0_5(a,b,c,d,e,f,g,h,i) \
1074 RND_0_6(a,b,c,d,e,f,g,h,i) \
1075 V_OR(XTMP1, XTMP2, XTMP1) \
1076 V_OR(XTMP3, XTMP4, XTMP3) \
1077 RND_0_7(a,b,c,d,e,f,g,h,i) \
1078 RND_0_8(a,b,c,d,e,f,g,h,i) \
1079 V_SHIFT_R(XTMP4, W_M15, 7) \
1080 V_XOR(XTMP1, XTMP3, XTMP1) \
1081 RND_0_9(a,b,c,d,e,f,g,h,i) \
1082 RND_0_10(a,b,c,d,e,f,g,h,i) \
1083 V_XOR(XTMP1, XTMP4, XTMP1) \
1084 V_ADD(W_0, W_0, W_M7) \
1085 RND_0_11(a,b,c,d,e,f,g,h,i) \
1086 RND_0_12(a,b,c,d,e,f,g,h,i) \
1087 RND_1_1(h,a,b,c,d,e,f,g,i+1) \
1088 V_ADD(W_0, W_0, XTMP1) \
1089 RND_1_2(h,a,b,c,d,e,f,g,i+1) \
1090 V_SHIFT_R(XTMP1, W_14, 19) \
1091 V_SHIFT_L(XTMP2, W_14, 45) \
1092 RND_1_3(h,a,b,c,d,e,f,g,i+1) \
1093 RND_1_4(h,a,b,c,d,e,f,g,i+1) \
1094 V_SHIFT_R(XTMP3, W_14, 61) \
1095 V_SHIFT_L(XTMP4, W_14, 3) \
1096 RND_1_5(h,a,b,c,d,e,f,g,i+1) \
1097 RND_1_6(h,a,b,c,d,e,f,g,i+1) \
1098 RND_1_7(h,a,b,c,d,e,f,g,i+1) \
1099 V_OR(XTMP1, XTMP2, XTMP1) \
1100 V_OR(XTMP3, XTMP4, XTMP3) \
1101 RND_1_8(h,a,b,c,d,e,f,g,i+1) \
1102 RND_1_9(h,a,b,c,d,e,f,g,i+1) \
1103 V_XOR(XTMP1, XTMP3, XTMP1) \
1104 V_SHIFT_R(XTMP4, W_14, 6) \
1105 RND_1_10(h,a,b,c,d,e,f,g,i+1) \
1106 RND_1_11(h,a,b,c,d,e,f,g,i+1) \
1107 V_XOR(XTMP1, XTMP4, XTMP1) \
1108 RND_1_12(h,a,b,c,d,e,f,g,i+1) \
1109 V_ADD(W_0, W_0, XTMP1) \
1110
1111#define RND_ALL_2(a, b, c, d, e, f, g, h, i) \
1112 RND_0_1 (a, b, c, d, e, f, g, h, i ) \
1113 RND_0_2 (a, b, c, d, e, f, g, h, i ) \
1114 RND_0_3 (a, b, c, d, e, f, g, h, i ) \
1115 RND_0_4 (a, b, c, d, e, f, g, h, i ) \
1116 RND_0_5 (a, b, c, d, e, f, g, h, i ) \
1117 RND_0_6 (a, b, c, d, e, f, g, h, i ) \
1118 RND_0_7 (a, b, c, d, e, f, g, h, i ) \
1119 RND_0_8 (a, b, c, d, e, f, g, h, i ) \
1120 RND_0_9 (a, b, c, d, e, f, g, h, i ) \
1121 RND_0_10(a, b, c, d, e, f, g, h, i ) \
1122 RND_0_11(a, b, c, d, e, f, g, h, i ) \
1123 RND_0_12(a, b, c, d, e, f, g, h, i ) \
1124 RND_1_1 (h, a, b, c, d, e, f, g, i+1) \
1125 RND_1_2 (h, a, b, c, d, e, f, g, i+1) \
1126 RND_1_3 (h, a, b, c, d, e, f, g, i+1) \
1127 RND_1_4 (h, a, b, c, d, e, f, g, i+1) \
1128 RND_1_5 (h, a, b, c, d, e, f, g, i+1) \
1129 RND_1_6 (h, a, b, c, d, e, f, g, i+1) \
1130 RND_1_7 (h, a, b, c, d, e, f, g, i+1) \
1131 RND_1_8 (h, a, b, c, d, e, f, g, i+1) \
1132 RND_1_9 (h, a, b, c, d, e, f, g, i+1) \
1133 RND_1_10(h, a, b, c, d, e, f, g, i+1) \
1134 RND_1_11(h, a, b, c, d, e, f, g, i+1) \
1135 RND_1_12(h, a, b, c, d, e, f, g, i+1)
1136
1137
1138#if defined(HAVE_INTEL_RORX)
1139
1140#define RND_RORX_0_1(a, b, c, d, e, f, g, h, i) \
1141 /* L1 = e>>>14 */ \
1142 "rorxq $14, "#e", " L1 "\n\t" \
1143 /* L2 = e>>>18 */ \
1144 "rorxq $18, "#e", " L2 "\n\t" \
1145 /* Prev RND: h += Maj(a,b,c) */ \
1146 "addq " L3 ", "#a"\n\t" \
1147
1148#define RND_RORX_0_2(a, b, c, d, e, f, g, h, i) \
1149 /* h += w_k */ \
1150 "addq ("#i")*8(" WX "), "#h"\n\t" \
1151 /* L3 = f */ \
1152 "movq "#f", " L3 "\n\t" \
1153 /* L2 = (e>>>14) ^ (e>>>18) */ \
1154 "xorq " L1 ", " L2 "\n\t" \
1155
1156#define RND_RORX_0_3(a, b, c, d, e, f, g, h, i) \
1157 /* L3 = f ^ g */ \
1158 "xorq "#g", " L3 "\n\t" \
1159 /* L1 = e>>>41 */ \
1160 "rorxq $41, "#e", " L1 "\n\t" \
1161 /* L1 = Sigma1(e) */ \
1162 "xorq " L2 ", " L1 "\n\t" \
1163
1164#define RND_RORX_0_4(a, b, c, d, e, f, g, h, i) \
1165 /* L3 = (f ^ g) & e */ \
1166 "andq "#e", " L3 "\n\t" \
1167 /* h += Sigma1(e) */ \
1168 "addq " L1 ", "#h"\n\t" \
1169 /* L1 = a>>>28 */ \
1170 "rorxq $28, "#a", " L1 "\n\t" \
1171
1172#define RND_RORX_0_5(a, b, c, d, e, f, g, h, i) \
1173 /* L2 = a>>>34 */ \
1174 "rorxq $34, "#a", " L2 "\n\t" \
1175 /* L3 = Ch(e,f,g) */ \
1176 "xorq "#g", " L3 "\n\t" \
1177 /* L2 = (a>>>28) ^ (a>>>34) */ \
1178 "xorq " L1 ", " L2 "\n\t" \
1179
1180#define RND_RORX_0_6(a, b, c, d, e, f, g, h, i) \
1181 /* L1 = a>>>39 */ \
1182 "rorxq $39, "#a", " L1 "\n\t" \
1183 /* h += Ch(e,f,g) */ \
1184 "addq " L3 ", "#h"\n\t" \
1185 /* L1 = Sigma0(a) */ \
1186 "xorq " L2 ", " L1 "\n\t" \
1187
1188#define RND_RORX_0_7(a, b, c, d, e, f, g, h, i) \
1189 /* L3 = b */ \
1190 "movq "#b", " L3 "\n\t" \
1191 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1192 "addq "#h", "#d"\n\t" \
1193 /* L3 = a ^ b */ \
1194 "xorq "#a", " L3 "\n\t" \
1195
1196#define RND_RORX_0_8(a, b, c, d, e, f, g, h, i) \
1197 /* L4 = (a ^ b) & (b ^ c) */ \
1198 "andq " L3 ", " L4 "\n\t" \
1199 /* h += Sigma0(a) */ \
1200 "addq " L1 ", "#h"\n\t" \
1201 /* L4 = Maj(a,b,c) */ \
1202 "xorq "#b", " L4 "\n\t" \
1203
1204#define RND_RORX_1_1(a, b, c, d, e, f, g, h, i) \
1205 /* L1 = e>>>14 */ \
1206 "rorxq $14, "#e", " L1 "\n\t" \
1207 /* L2 = e>>>18 */ \
1208 "rorxq $18, "#e", " L2 "\n\t" \
1209 /* Prev RND: h += Maj(a,b,c) */ \
1210 "addq " L4 ", "#a"\n\t" \
1211
1212#define RND_RORX_1_2(a, b, c, d, e, f, g, h, i) \
1213 /* h += w_k */ \
1214 "addq ("#i")*8(" WX "), "#h"\n\t" \
1215 /* L4 = f */ \
1216 "movq "#f", " L4 "\n\t" \
1217 /* L2 = (e>>>14) ^ (e>>>18) */ \
1218 "xorq " L1 ", " L2 "\n\t" \
1219
1220#define RND_RORX_1_3(a, b, c, d, e, f, g, h, i) \
1221 /* L4 = f ^ g */ \
1222 "xorq "#g", " L4 "\n\t" \
1223 /* L1 = e>>>41 */ \
1224 "rorxq $41, "#e", " L1 "\n\t" \
1225 /* L1 = Sigma1(e) */ \
1226 "xorq " L2 ", " L1 "\n\t" \
1227
1228#define RND_RORX_1_4(a, b, c, d, e, f, g, h, i) \
1229 /* L4 = (f ^ g) & e */ \
1230 "andq "#e", " L4 "\n\t" \
1231 /* h += Sigma1(e) */ \
1232 "addq " L1 ", "#h"\n\t" \
1233 /* L1 = a>>>28 */ \
1234 "rorxq $28, "#a", " L1 "\n\t" \
1235
1236#define RND_RORX_1_5(a, b, c, d, e, f, g, h, i) \
1237 /* L2 = a>>>34 */ \
1238 "rorxq $34, "#a", " L2 "\n\t" \
1239 /* L4 = Ch(e,f,g) */ \
1240 "xorq "#g", " L4 "\n\t" \
1241 /* L2 = (a>>>28) ^ (a>>>34) */ \
1242 "xorq " L1 ", " L2 "\n\t" \
1243
1244#define RND_RORX_1_6(a, b, c, d, e, f, g, h, i) \
1245 /* L1 = a>>>39 */ \
1246 "rorxq $39, "#a", " L1 "\n\t" \
1247 /* h += Ch(e,f,g) */ \
1248 "addq " L4 ", "#h"\n\t" \
1249 /* L1 = Sigma0(a) */ \
1250 "xorq " L2 ", " L1 "\n\t" \
1251
1252#define RND_RORX_1_7(a, b, c, d, e, f, g, h, i) \
1253 /* L4 = b */ \
1254 "movq "#b", " L4 "\n\t" \
1255 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1256 "addq "#h", "#d"\n\t" \
1257 /* L4 = a ^ b */ \
1258 "xorq "#a", " L4 "\n\t" \
1259
1260#define RND_RORX_1_8(a, b, c, d, e, f, g, h, i) \
 1261 /* L3 = (a ^ b) & (b ^ c) */ \
1262 "andq " L4 ", " L3 "\n\t" \
1263 /* h += Sigma0(a) */ \
1264 "addq " L1 ", "#h"\n\t" \
1265 /* L3 = Maj(a,b,c) */ \
1266 "xorq "#b", " L3 "\n\t" \
1267
1268#define RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i) \
1269 RND_RORX_0_1(a, b, c, d, e, f, g, h, i+0) \
1270 RND_RORX_0_2(a, b, c, d, e, f, g, h, i+0) \
1271 RND_RORX_0_3(a, b, c, d, e, f, g, h, i+0) \
1272 RND_RORX_0_4(a, b, c, d, e, f, g, h, i+0) \
1273 RND_RORX_0_5(a, b, c, d, e, f, g, h, i+0) \
1274 RND_RORX_0_6(a, b, c, d, e, f, g, h, i+0) \
1275 RND_RORX_0_7(a, b, c, d, e, f, g, h, i+0) \
1276 RND_RORX_0_8(a, b, c, d, e, f, g, h, i+0) \
1277 RND_RORX_1_1(h, a, b, c, d, e, f, g, i+1) \
1278 RND_RORX_1_2(h, a, b, c, d, e, f, g, i+1) \
1279 RND_RORX_1_3(h, a, b, c, d, e, f, g, i+1) \
1280 RND_RORX_1_4(h, a, b, c, d, e, f, g, i+1) \
1281 RND_RORX_1_5(h, a, b, c, d, e, f, g, i+1) \
1282 RND_RORX_1_6(h, a, b, c, d, e, f, g, i+1) \
1283 RND_RORX_1_7(h, a, b, c, d, e, f, g, i+1) \
1284 RND_RORX_1_8(h, a, b, c, d, e, f, g, i+1) \
1285
1286#define RND_RORX_ALL_4(a, b, c, d, e, f, g, h, i) \
1287 RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i+0) \
1288 RND_RORX_ALL_2(g, h, a, b, c, d, e, f, i+2)
1289
1290#define MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
1291 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
1292 VPALIGNR(W_M15, W_2, W_0, 8) \
1293 VPALIGNR(W_M7, W_10, W_8, 8) \
1294 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
1295 V_SHIFT_R(XTMP1, W_M15, 1) \
1296 V_SHIFT_L(XTMP2, W_M15, 63) \
1297 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
1298 V_SHIFT_R(XTMP3, W_M15, 8) \
1299 V_SHIFT_L(XTMP4, W_M15, 56) \
1300 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
1301 V_OR(XTMP1, XTMP2, XTMP1) \
1302 V_OR(XTMP3, XTMP4, XTMP3) \
1303 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
1304 V_SHIFT_R(XTMP4, W_M15, 7) \
1305 V_XOR(XTMP1, XTMP3, XTMP1) \
1306 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
1307 V_XOR(XTMP1, XTMP4, XTMP1) \
1308 V_ADD(W_0, W_0, W_M7) \
1309 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
1310 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
1311 V_ADD(W_0, W_0, XTMP1) \
1312 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
1313 V_SHIFT_R(XTMP1, W_14, 19) \
1314 V_SHIFT_L(XTMP2, W_14, 45) \
1315 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
1316 V_SHIFT_R(XTMP3, W_14, 61) \
1317 V_SHIFT_L(XTMP4, W_14, 3) \
1318 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
1319 V_OR(XTMP1, XTMP2, XTMP1) \
1320 V_OR(XTMP3, XTMP4, XTMP3) \
1321 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
1322 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
1323 V_XOR(XTMP1, XTMP3, XTMP1) \
1324 V_SHIFT_R(XTMP4, W_14, 6) \
1325 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
1326 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
1327 V_XOR(XTMP1, XTMP4, XTMP1) \
1328 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
1329 V_ADD(W_0, W_0, XTMP1) \
1330
1331#endif
1332
1333#define _INIT_MASK(mask) \
1334 "vmovdqu %[mask], %%" #mask "\n\t"
1335#define INIT_MASK(mask) \
1336 _INIT_MASK(mask)
1337
1338#define _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
1339 "vmovdqu " #i1 "*16(%%" #reg "), %%" #xmm1 "\n\t" \
1340 "vmovdqu " #i2 "*16(%%" #reg "), %%" #xmm2 "\n\t" \
1341 "vpshufb %%" #mask ", %%" #xmm1 ", %%" #xmm1 "\n\t" \
1342 "vpshufb %%" #mask ", %%" #xmm2 ", %%" #xmm2 "\n\t"
1343#define LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
1344 _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg)
1345
1346#define LOAD_W(mask, reg) \
 1347 /* W_0..W_14 (xmm0..xmm7), W[0..15] = buffer[0..15]; */ \
1348 LOAD_W_2(0, 1, W_0 , W_2 , mask, reg) \
1349 LOAD_W_2(2, 3, W_4 , W_6 , mask, reg) \
1350 LOAD_W_2(4, 5, W_8 , W_10, mask, reg) \
1351 LOAD_W_2(6, 7, W_12, W_14, mask, reg)
1352
1353#define _SET_W_X_2(xmm0, xmm1, reg, i) \
1354 "vpaddq " #i "+ 0(%%" #reg "), %%" #xmm0 ", %%xmm8\n\t" \
1355 "vpaddq " #i "+16(%%" #reg "), %%" #xmm1 ", %%xmm9\n\t" \
1356 "vmovdqu %%xmm8, " #i "+ 0(" WX ")\n\t" \
1357 "vmovdqu %%xmm9, " #i "+16(" WX ")\n\t" \
1358
1359#define SET_W_X_2(xmm0, xmm1, reg, i) \
1360 _SET_W_X_2(xmm0, xmm1, reg, i)
1361
1362#define SET_W_X(reg) \
1363 SET_W_X_2(W_0 , W_2 , reg, 0) \
1364 SET_W_X_2(W_4 , W_6 , reg, 32) \
1365 SET_W_X_2(W_8 , W_10, reg, 64) \
1366 SET_W_X_2(W_12, W_14, reg, 96)
1367
1368#define LOAD_DIGEST() \
1369 "movq (%[sha512]), %%r8 \n\t" \
1370 "movq 8(%[sha512]), %%r9 \n\t" \
1371 "movq 16(%[sha512]), %%r10\n\t" \
1372 "movq 24(%[sha512]), %%r11\n\t" \
1373 "movq 32(%[sha512]), %%r12\n\t" \
1374 "movq 40(%[sha512]), %%r13\n\t" \
1375 "movq 48(%[sha512]), %%r14\n\t" \
1376 "movq 56(%[sha512]), %%r15\n\t"
1377
1378#define STORE_ADD_DIGEST() \
1379 "addq %%r8, (%[sha512])\n\t" \
1380 "addq %%r9, 8(%[sha512])\n\t" \
1381 "addq %%r10, 16(%[sha512])\n\t" \
1382 "addq %%r11, 24(%[sha512])\n\t" \
1383 "addq %%r12, 32(%[sha512])\n\t" \
1384 "addq %%r13, 40(%[sha512])\n\t" \
1385 "addq %%r14, 48(%[sha512])\n\t" \
1386 "addq %%r15, 56(%[sha512])\n\t"
1387
1388#define ADD_DIGEST() \
1389 "addq (%[sha512]), %%r8 \n\t" \
1390 "addq 8(%[sha512]), %%r9 \n\t" \
1391 "addq 16(%[sha512]), %%r10\n\t" \
1392 "addq 24(%[sha512]), %%r11\n\t" \
1393 "addq 32(%[sha512]), %%r12\n\t" \
1394 "addq 40(%[sha512]), %%r13\n\t" \
1395 "addq 48(%[sha512]), %%r14\n\t" \
1396 "addq 56(%[sha512]), %%r15\n\t"
1397
1398#define STORE_DIGEST() \
1399 "movq %%r8, (%[sha512])\n\t" \
1400 "movq %%r9, 8(%[sha512])\n\t" \
1401 "movq %%r10, 16(%[sha512])\n\t" \
1402 "movq %%r11, 24(%[sha512])\n\t" \
1403 "movq %%r12, 32(%[sha512])\n\t" \
1404 "movq %%r13, 40(%[sha512])\n\t" \
1405 "movq %%r14, 48(%[sha512])\n\t" \
1406 "movq %%r15, 56(%[sha512])\n\t"
1407
1408#endif /* HAVE_INTEL_AVX1 */
1409
1410
1411/*** Transform Body ***/
1412#if defined(HAVE_INTEL_AVX1)
1413static int Transform_Sha512_AVX1(wc_Sha512* sha512)
1414{
1415 __asm__ __volatile__ (
1416
1417 /* 16 Ws plus loop counter. */
1418 "subq $136, %%rsp\n\t"
1419 "leaq 64(%[sha512]), %%rax\n\t"
1420
1421 INIT_MASK(MASK)
1422 LOAD_DIGEST()
1423
1424 LOAD_W(MASK, rax)
1425
1426 "movl $4, 16*8(" WX ")\n\t"
1427 "leaq %[K512], %%rsi\n\t"
1428 /* b */
1429 "movq %%r9, " L4 "\n\t"
1430 /* e */
1431 "movq %%r12, " L1 "\n\t"
1432 /* b ^ c */
1433 "xorq %%r10, " L4 "\n\t"
1434
1435 "# Start of 16 rounds\n"
1436 "1:\n\t"
1437
1438 SET_W_X(rsi)
1439
1440 "addq $128, %%rsi\n\t"
1441
1442 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
1443 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
1444 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
1445 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
1446 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
1447 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
1448 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
1449 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
1450
1451 "subl $1, 16*8(" WX ")\n\t"
1452 "jne 1b\n\t"
1453
1454 SET_W_X(rsi)
1455
1456 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
1457 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
1458 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
1459 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
1460
1461 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
1462 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
1463 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
1464 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
1465
1466 STORE_ADD_DIGEST()
1467
1468 "addq $136, %%rsp\n\t"
1469
1470 :
1471 : [mask] "m" (mBYTE_FLIP_MASK),
1472 [sha512] "r" (sha512),
1473 [K512] "m" (K512)
1474 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
1475 );
1476
1477 return 0;
1478 }
1479
1480static int Transform_Sha512_AVX1_Len(wc_Sha512* sha512, word32 len)
1481{
1482 __asm__ __volatile__ (
1483
1484 "movq 224(%[sha512]), %%rsi\n\t"
1485 "leaq %[K512], %%rdx\n\t"
1486
1487 INIT_MASK(MASK)
1488 LOAD_DIGEST()
1489
1490 "# Start of processing a block\n"
1491 "2:\n\t"
1492
1493 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
1494 * Debug needs more stack space. */
1495 "subq $256, %%rsp\n\t"
1496
1497 LOAD_W(MASK, rsi)
1498
1499 "movl $4, 16*8(" WX ")\n\t"
1500 /* b */
1501 "movq %%r9, " L4 "\n\t"
1502 /* e */
1503 "movq %%r12, " L1 "\n\t"
1504 /* b ^ c */
1505 "xorq %%r10, " L4 "\n\t"
1506
1507 SET_W_X(rdx)
1508
1509 "# Start of 16 rounds\n"
1510 "1:\n\t"
1511
1512 "addq $128, %%rdx\n\t"
1513 "movq %%rdx, 17*8(%%rsp)\n\t"
1514
1515 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
1516 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
1517 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
1518 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
1519 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
1520 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
1521 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
1522 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
1523
1524 "movq 17*8(%%rsp), %%rdx\n\t"
1525
1526 SET_W_X(rdx)
1527
1528 "subl $1, 16*8(" WX ")\n\t"
1529 "jne 1b\n\t"
1530
1531 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
1532 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
1533 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
1534 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
1535
1536 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
1537 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
1538 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
1539 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
1540
1541 ADD_DIGEST()
1542
1543 "addq $256, %%rsp\n\t"
1544 "leaq %[K512], %%rdx\n\t"
1545 "addq $128, %%rsi\n\t"
1546 "subl $128, %[len]\n\t"
1547
1548 STORE_DIGEST()
1549
1550 "jnz 2b\n\t"
1551
1552 :
1553 : [mask] "m" (mBYTE_FLIP_MASK),
1554 [len] "m" (len),
1555 [sha512] "r" (sha512),
1556 [K512] "m" (K512)
1557 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
1558 );
1559
1560 return 0;
1561}
1562#endif /* HAVE_INTEL_AVX1 */
1563
1564#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
1565static int Transform_Sha512_AVX1_RORX(wc_Sha512* sha512)
1566{
1567 __asm__ __volatile__ (
1568
1569 /* 16 Ws plus loop counter and K512. */
1570 "subq $144, %%rsp\n\t"
1571 "leaq 64(%[sha512]), %%rax\n\t"
1572
1573 INIT_MASK(MASK)
1574 LOAD_DIGEST()
1575
1576 LOAD_W(MASK, rax)
1577
1578 "movl $4, 16*8(" WX ")\n\t"
1579 "leaq %[K512], %%rsi\n\t"
1580 /* L4 = b */
1581 "movq %%r9, " L4 "\n\t"
1582 /* L3 = 0 (add to prev h) */
1583 "xorq " L3 ", " L3 "\n\t"
1584 /* L4 = b ^ c */
1585 "xorq %%r10, " L4 "\n\t"
1586
1587 SET_W_X(rsi)
1588
1589 "# Start of 16 rounds\n"
1590 "1:\n\t"
1591
1592 "addq $128, %%rsi\n\t"
1593
1594 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
1595 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
1596 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
1597 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
1598 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
1599 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
1600 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
1601 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
1602
1603 SET_W_X(rsi)
1604
1605 "subl $1, 16*8(" WX ")\n\t"
1606 "jne 1b\n\t"
1607
1608 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
1609 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
1610 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
1611 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
1612
1613 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
1614 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
1615 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
1616 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
1617
1618 /* Prev RND: h += Maj(a,b,c) */
1619 "addq " L3 ", %%r8\n\t"
1620 "addq $144, %%rsp\n\t"
1621
1622 STORE_ADD_DIGEST()
1623
1624 :
1625 : [mask] "m" (mBYTE_FLIP_MASK),
1626 [sha512] "r" (sha512),
1627 [K512] "m" (K512)
1628 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
1629 );
1630
1631 return 0;
1632 }
1633
1634static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512* sha512, word32 len)
1635{
1636 __asm__ __volatile__ (
1637
1638 "movq 224(%[sha512]), %%rsi\n\t"
1639 "leaq %[K512], %%rcx\n\t"
1640
1641 INIT_MASK(MASK)
1642 LOAD_DIGEST()
1643
1644 "# Start of processing a block\n"
1645 "2:\n\t"
1646
1647 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
1648 * Debug needs more stack space. */
1649 "subq $256, %%rsp\n\t"
1650
1651 LOAD_W(MASK, rsi)
1652
1653 "movl $4, 16*8(" WX ")\n\t"
1654 /* L4 = b */
1655 "movq %%r9, " L4 "\n\t"
1656 /* L3 = 0 (add to prev h) */
1657 "xorq " L3 ", " L3 "\n\t"
1658 /* L4 = b ^ c */
1659 "xorq %%r10, " L4 "\n\t"
1660
1661 SET_W_X(rcx)
1662
1663 "# Start of 16 rounds\n"
1664 "1:\n\t"
1665
1666 "addq $128, %%rcx\n\t"
1667 "movq %%rcx, 17*8(%%rsp)\n\t"
1668
1669 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
1670 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
1671 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
1672 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
1673 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
1674 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
1675 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
1676 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
1677
1678 "movq 17*8(%%rsp), %%rcx\n\t"
1679
1680 SET_W_X(rcx)
1681
1682 "subl $1, 16*8(" WX ")\n\t"
1683 "jne 1b\n\t"
1684
1685 SET_W_X(rcx)
1686
1687 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
1688 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
1689 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
1690 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
1691
1692 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
1693 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
1694 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
1695 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
1696
1697 /* Prev RND: h += Maj(a,b,c) */
1698 "addq " L3 ", %%r8\n\t"
1699 "addq $256, %%rsp\n\t"
1700
1701 ADD_DIGEST()
1702
1703 "leaq %[K512], %%rcx\n\t"
1704 "addq $128, %%rsi\n\t"
1705 "subl $128, %[len]\n\t"
1706
1707 STORE_DIGEST()
1708
1709 "jnz 2b\n\t"
1710
1711 :
1712 : [mask] "m" (mBYTE_FLIP_MASK),
1713 [len] "m" (len),
1714 [sha512] "r" (sha512),
1715 [K512] "m" (K512)
1716 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
1717 );
1718
1719 return 0;
1720}
1721#endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */
1722
1723#if defined(HAVE_INTEL_AVX2)
1724static const unsigned long mBYTE_FLIP_MASK_Y[] =
1725 { 0x0001020304050607, 0x08090a0b0c0d0e0f,
1726 0x0001020304050607, 0x08090a0b0c0d0e0f };
1727
1728#define W_Y_0 ymm0
1729#define W_Y_4 ymm1
1730#define W_Y_8 ymm2
1731#define W_Y_12 ymm3
1732
1733#define X0 xmm0
1734#define X1 xmm1
1735#define X2 xmm2
1736#define X3 xmm3
1737#define X4 xmm4
1738#define X5 xmm5
1739#define X6 xmm6
1740#define X7 xmm7
1741#define X8 xmm8
1742#define X9 xmm9
1743#define Y0 ymm0
1744#define Y1 ymm1
1745#define Y2 ymm2
1746#define Y3 ymm3
1747#define Y4 ymm4
1748#define Y5 ymm5
1749#define Y6 ymm6
1750#define Y7 ymm7
1751
1752#define W_Y_M15 ymm12
1753#define W_Y_M7 ymm13
1754#define W_Y_M2 ymm14
1755#define MASK_Y ymm15
1756
1757#define YTMP1 ymm8
1758#define YTMP2 ymm9
1759#define YTMP3 ymm10
1760#define YTMP4 ymm11
1761
1762#define YMM_REGS \
1763 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", \
1764 "xmm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"
1765
1766#define _VPERM2I128(dest, src1, src2, sel) \
1767 "vperm2I128 $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
1768#define VPERM2I128(dest, src1, src2, sel) \
1769 _VPERM2I128(dest, src1, src2, sel)
1770
1771#define _VPERMQ(dest, src, sel) \
1772 "vpermq $" #sel ", %%" #src ", %%" #dest "\n\t"
1773#define VPERMQ(dest, src, sel) \
1774 _VPERMQ(dest, src, sel)
1775
1776#define _VPBLENDD(dest, src1, src2, sel) \
1777 "vpblendd $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
1778#define VPBLENDD(dest, src1, src2, sel) \
1779 _VPBLENDD(dest, src1, src2, sel)
1780
1781#define _V_ADD_I(dest, src1, addr, i) \
1782 "vpaddq "#i"*8(%%" #addr "), %%" #src1 ", %%" #dest "\n\t"
1783#define V_ADD_I(dest, src1, addr, i) \
1784 _V_ADD_I(dest, src1, addr, i)
1785
1786#define _VMOVDQU_I(addr, i, src) \
1787 "vmovdqu %%" #src ", " #i "*8(%%" #addr ")\n\t"
1788#define VMOVDQU_I(addr, i, src) \
1789 _VMOVDQU_I(addr, i, src)
1790
1791#define MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
1792 RND_0_1(a,b,c,d,e,f,g,h,i) \
1793 /* W[-13]..W[-15], W[-12] */ \
1794 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \
1795 /* W[-5]..W[-7], W[-4] */ \
1796 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \
1797 RND_0_2(a,b,c,d,e,f,g,h,i) \
1798 RND_0_3(a,b,c,d,e,f,g,h,i) \
1799 /* W_Y_M15 = W[-12]..W[-15] */ \
1800 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \
1801 RND_0_4(a,b,c,d,e,f,g,h,i) \
1802 /* W_Y_M7 = W[-4]..W[-7] */ \
1803 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \
1804 RND_0_5(a,b,c,d,e,f,g,h,i) \
1805 RND_0_6(a,b,c,d,e,f,g,h,i) \
1806 /* W[-15] >> 1 */ \
1807 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
1808 RND_0_7(a,b,c,d,e,f,g,h,i) \
1809 /* W[-15] << 63 */ \
1810 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
1811 RND_0_8(a,b,c,d,e,f,g,h,i) \
1812 /* W[-15] >> 8 */ \
1813 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
1814 RND_0_9(a,b,c,d,e,f,g,h,i) \
1815 /* W[-15] << 56 */ \
1816 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
1817 RND_0_10(a,b,c,d,e,f,g,h,i) \
1818 /* W[-15] >>> 1 */ \
1819 V_OR(YTMP1, YTMP2, YTMP1) \
1820 RND_0_11(a,b,c,d,e,f,g,h,i) \
1821 /* W[-15] >>> 8 */ \
1822 V_OR(YTMP3, YTMP4, YTMP3) \
1823 RND_0_12(a,b,c,d,e,f,g,h,i) \
1824 RND_1_1(h,a,b,c,d,e,f,g,i+1) \
1825 /* W[-15] >> 7 */ \
1826 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
1827 RND_1_2_A(h,a,b,c,d,e,f,g,i+1) \
1828 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \
1829 V_XOR(YTMP1, YTMP3, YTMP1) \
1830 RND_1_2_B(h,a,b,c,d,e,f,g,i+1) \
1831 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \
1832 V_XOR(YTMP1, YTMP4, YTMP1) \
1833 RND_1_3(h,a,b,c,d,e,f,g,i+1) \
1834 /* W[0] = W[-16] + W[-7] */ \
1835 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \
1836 RND_1_4(h,a,b,c,d,e,f,g,i+1) \
1837 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \
1838 V_ADD(W_Y_0, W_Y_0, YTMP1) \
1839 RND_1_5(h,a,b,c,d,e,f,g,i+1) \
1840 /* 0, 0, W[-1], W[-2] */ \
1841 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \
1842 RND_1_6(h,a,b,c,d,e,f,g,i+1) \
1843 RND_1_7(h,a,b,c,d,e,f,g,i+1) \
1844 RND_1_8(h,a,b,c,d,e,f,g,i+1) \
1845 /* W[-2] >> 19 */ \
1846 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
1847 RND_1_9(h,a,b,c,d,e,f,g,i+1) \
1848 /* W[-2] << 45 */ \
1849 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
1850 RND_1_10(h,a,b,c,d,e,f,g,i+1) \
1851 /* W[-2] >> 61 */ \
1852 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
1853 RND_1_11(h,a,b,c,d,e,f,g,i+1) \
1854 /* W[-2] << 3 */ \
1855 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
1856 RND_1_12(h,a,b,c,d,e,f,g,i+1) \
1857 RND_0_1(g,h,a,b,c,d,e,f,i+2) \
1858 /* W[-2] >>> 19 */ \
1859 V_OR(YTMP1, YTMP2, YTMP1) \
1860 RND_0_2(g,h,a,b,c,d,e,f,i+2) \
1861 /* W[-2] >>> 61 */ \
1862 V_OR(YTMP3, YTMP4, YTMP3) \
1863 RND_0_3(g,h,a,b,c,d,e,f,i+2) \
1864 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
1865 V_XOR(YTMP1, YTMP3, YTMP1) \
1866 RND_0_4(g,h,a,b,c,d,e,f,i+2) \
1867 /* W[-2] >> 6 */ \
1868 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
1869 RND_0_5(g,h,a,b,c,d,e,f,i+2) \
1870 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
1871 V_XOR(YTMP1, YTMP4, YTMP1) \
1872 RND_0_6(g,h,a,b,c,d,e,f,i+2) \
1873 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
1874 V_ADD(W_Y_0, W_Y_0, YTMP1) \
1875 RND_0_7(g,h,a,b,c,d,e,f,i+2) \
1876 RND_0_8(g,h,a,b,c,d,e,f,i+2) \
1877 /* W[1], W[0], 0, 0 */ \
1878 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \
1879 RND_0_9(g,h,a,b,c,d,e,f,i+2) \
1880 RND_0_10(g,h,a,b,c,d,e,f,i+2) \
1881 /* W[-2] >> 19 */ \
1882 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
1883 RND_0_11(g,h,a,b,c,d,e,f,i+2) \
1884 /* W[-2] << 45 */ \
1885 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
1886 RND_0_12(g,h,a,b,c,d,e,f,i+2) \
1887 RND_1_1(f,g,h,a,b,c,d,e,i+3) \
1888 /* W[-2] >> 61 */ \
1889 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
1890 RND_1_2(f,g,h,a,b,c,d,e,i+3) \
1891 /* W[-2] << 3 */ \
1892 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
1893 RND_1_3(f,g,h,a,b,c,d,e,i+3) \
1894 /* W[-2] >>> 19 */ \
1895 V_OR(YTMP1, YTMP2, YTMP1) \
1896 RND_1_4(f,g,h,a,b,c,d,e,i+3) \
1897 /* W[-2] >>> 61 */ \
1898 V_OR(YTMP3, YTMP4, YTMP3) \
1899 RND_1_5(f,g,h,a,b,c,d,e,i+3) \
1900 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
1901 V_XOR(YTMP1, YTMP3, YTMP1) \
1902 RND_1_6(f,g,h,a,b,c,d,e,i+3) \
1903 /* W[-2] >> 6 */ \
1904 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
1905 RND_1_7(f,g,h,a,b,c,d,e,i+3) \
1906 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
1907 V_XOR(YTMP1, YTMP4, YTMP1) \
1908 RND_1_8(f,g,h,a,b,c,d,e,i+3) \
1909 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
1910 V_ADD(W_Y_0, W_Y_0, YTMP1) \
1911 RND_1_9(f,g,h,a,b,c,d,e,i+3) \
1912 RND_1_10(f,g,h,a,b,c,d,e,i+3) \
1913 RND_1_11(f,g,h,a,b,c,d,e,i+3) \
1914 RND_1_12(f,g,h,a,b,c,d,e,i+3) \
1915
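/* Reference sketch (not compiled): the scalar form of the message schedule
 * that the MsgSched*_AVX2 macros compute with YMM registers,
 *     W[i] = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2]),
 * with the rotates built from the same shift pairs used above
 * (x >>> r == (x >> r) | (x << (64 - r))). The function names below are
 * illustrative only and are not used elsewhere in this file.
 */
#if 0
static word64 Sha512_s0_ref(word64 x)
{
    return ((x >> 1) | (x << 63)) ^ ((x >> 8) | (x << 56)) ^ (x >> 7);
}
static word64 Sha512_s1_ref(word64 x)
{
    return ((x >> 19) | (x << 45)) ^ ((x >> 61) | (x << 3)) ^ (x >> 6);
}
/* W[i] = W[i-16] + Sha512_s0_ref(W[i-15]) + W[i-7] + Sha512_s1_ref(W[i-2]); */
#endif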
1916#define MsgSched2_AVX2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
1917 RND_0_1(a,b,c,d,e,f,g,h,i) \
1918 VPALIGNR(W_Y_M15, W_2, W_0, 8) \
1919 VPALIGNR(W_Y_M7, W_10, W_8, 8) \
1920 RND_0_2(a,b,c,d,e,f,g,h,i) \
1921 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
1922 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
1923 RND_0_3(a,b,c,d,e,f,g,h,i) \
1924 RND_0_4(a,b,c,d,e,f,g,h,i) \
1925 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
1926 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
1927 RND_0_5(a,b,c,d,e,f,g,h,i) \
1928 RND_0_6(a,b,c,d,e,f,g,h,i) \
1929 V_OR(YTMP1, YTMP2, YTMP1) \
1930 V_OR(YTMP3, YTMP4, YTMP3) \
1931 RND_0_7(a,b,c,d,e,f,g,h,i) \
1932 RND_0_8(a,b,c,d,e,f,g,h,i) \
1933 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
1934 V_XOR(YTMP1, YTMP3, YTMP1) \
1935 RND_0_9(a,b,c,d,e,f,g,h,i) \
1936 RND_0_10(a,b,c,d,e,f,g,h,i) \
1937 V_XOR(YTMP1, YTMP4, YTMP1) \
1938 V_ADD(W_0, W_0, W_Y_M7) \
1939 RND_0_11(a,b,c,d,e,f,g,h,i) \
1940 RND_0_12(a,b,c,d,e,f,g,h,i) \
1941 RND_1_1(h,a,b,c,d,e,f,g,i+1) \
1942 V_ADD(W_0, W_0, YTMP1) \
1943 RND_1_2(h,a,b,c,d,e,f,g,i+1) \
1944 V_SHIFT_R(YTMP1, W_14, 19) \
1945 V_SHIFT_L(YTMP2, W_14, 45) \
1946 RND_1_3(h,a,b,c,d,e,f,g,i+1) \
1947 RND_1_4(h,a,b,c,d,e,f,g,i+1) \
1948 V_SHIFT_R(YTMP3, W_14, 61) \
1949 V_SHIFT_L(YTMP4, W_14, 3) \
1950 RND_1_5(h,a,b,c,d,e,f,g,i+1) \
1951 RND_1_6(h,a,b,c,d,e,f,g,i+1) \
1952 RND_1_7(h,a,b,c,d,e,f,g,i+1) \
1953 V_OR(YTMP1, YTMP2, YTMP1) \
1954 V_OR(YTMP3, YTMP4, YTMP3) \
1955 RND_1_8(h,a,b,c,d,e,f,g,i+1) \
1956 RND_1_9(h,a,b,c,d,e,f,g,i+1) \
1957 V_XOR(YTMP1, YTMP3, YTMP1) \
1958 V_SHIFT_R(YTMP4, W_14, 6) \
1959 RND_1_10(h,a,b,c,d,e,f,g,i+1) \
1960 RND_1_11(h,a,b,c,d,e,f,g,i+1) \
1961 V_XOR(YTMP1, YTMP4, YTMP1) \
1962 RND_1_12(h,a,b,c,d,e,f,g,i+1) \
1963 V_ADD(W_0, W_0, YTMP1) \
1964
1965#define MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
1966 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
1967 /* W[-13]..W[-15], W[-12] */ \
1968 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \
1969 /* W[-5]..W[-7], W[-4] */ \
1970 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \
1971 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
1972 /* W_Y_M15 = W[-12]..W[-15] */ \
1973 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \
1974 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
1975 /* W_Y_M7 = W[-4]..W[-7] */ \
1976 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \
1977 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
1978 /* W[-15] >> 1 */ \
1979 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
1980 /* W[-15] << 63 */ \
1981 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
1982 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
1983 /* W[-15] >> 8 */ \
1984 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
1985 /* W[-15] << 56 */ \
1986 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
1987 /* W[-15] >>> 1 */ \
1988 V_OR(YTMP1, YTMP2, YTMP1) \
1989 /* W[-15] >>> 8 */ \
1990 V_OR(YTMP3, YTMP4, YTMP3) \
1991 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
1992 /* W[-15] >> 7 */ \
1993 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
1994 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
1995 /* 0, 0, W[-1], W[-2] */ \
1996 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \
1997 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
1998 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
1999 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \
2000 V_XOR(YTMP1, YTMP3, YTMP1) \
2001 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
2002 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \
2003 V_XOR(YTMP1, YTMP4, YTMP1) \
2004 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
2005 /* W[0] = W[-16] + W[-7] */ \
2006 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \
2007 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \
2008 V_ADD(W_Y_0, W_Y_0, YTMP1) \
2009 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
2010 /* W[-2] >> 19 */ \
2011 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
2012 /* W[-2] << 45 */ \
2013 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
2014 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
2015 /* W[-2] >> 61 */ \
2016 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
2017 /* W[-2] << 3 */ \
2018 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
2019 /* W[-2] >>> 19 */ \
2020 V_OR(YTMP1, YTMP2, YTMP1) \
2021 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
2022 /* W[-2] >>> 61 */ \
2023 V_OR(YTMP3, YTMP4, YTMP3) \
2024 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
2025 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
2026 V_XOR(YTMP1, YTMP3, YTMP1) \
2027 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
2028 /* W[-2] >> 6 */ \
2029 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
2030 RND_RORX_0_1(g,h,a,b,c,d,e,f,i+2) \
2031 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
2032 V_XOR(YTMP1, YTMP4, YTMP1) \
2033 RND_RORX_0_2(g,h,a,b,c,d,e,f,i+2) \
2034 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
2035 V_ADD(W_Y_0, W_Y_0, YTMP1) \
2036 RND_RORX_0_3(g,h,a,b,c,d,e,f,i+2) \
2037 /* W[1], W[0], 0, 0 */ \
2038 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \
2039 RND_RORX_0_4(g,h,a,b,c,d,e,f,i+2) \
2040 RND_RORX_0_5(g,h,a,b,c,d,e,f,i+2) \
2041 /* W[-2] >> 19 */ \
2042 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
2043 /* W[-2] << 45 */ \
2044 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
2045 RND_RORX_0_6(g,h,a,b,c,d,e,f,i+2) \
2046 /* W[-2] >> 61 */ \
2047 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
2048 /* W[-2] << 3 */ \
2049 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
2050 /* W[-2] >>> 19 */ \
2051 V_OR(YTMP1, YTMP2, YTMP1) \
2052 RND_RORX_0_7(g,h,a,b,c,d,e,f,i+2) \
2053 /* W[-2] >>> 61 */ \
2054 V_OR(YTMP3, YTMP4, YTMP3) \
2055 RND_RORX_0_8(g,h,a,b,c,d,e,f,i+2) \
2056 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
2057 V_XOR(YTMP1, YTMP3, YTMP1) \
2058 RND_RORX_1_1(f,g,h,a,b,c,d,e,i+3) \
2059 /* W[-2] >> 6 */ \
2060 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
2061 RND_RORX_1_2(f,g,h,a,b,c,d,e,i+3) \
2062 RND_RORX_1_3(f,g,h,a,b,c,d,e,i+3) \
2063 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
2064 V_XOR(YTMP1, YTMP4, YTMP1) \
2065 RND_RORX_1_4(f,g,h,a,b,c,d,e,i+3) \
2066 RND_RORX_1_5(f,g,h,a,b,c,d,e,i+3) \
2067 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
2068 V_ADD(W_Y_0, W_Y_0, YTMP1) \
2069 RND_RORX_1_6(f,g,h,a,b,c,d,e,i+3) \
2070 V_ADD_I(YTMP1, W_Y_0, rsi, i) \
2071 RND_RORX_1_7(f,g,h,a,b,c,d,e,i+3) \
2072 RND_RORX_1_8(f,g,h,a,b,c,d,e,i+3) \
2073 VMOVDQU_I(rsp, i, YTMP1) \
2074
2075#define MsgSched2_AVX2_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e, \
2076 f,g,h,i) \
2077 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
2078 VPALIGNR(W_Y_M15, W_2, W_0, 8) \
2079 VPALIGNR(W_Y_M7, W_10, W_8, 8) \
2080 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
2081 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
2082 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
2083 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
2084 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
2085 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
2086 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
2087 V_OR(YTMP1, YTMP2, YTMP1) \
2088 V_OR(YTMP3, YTMP4, YTMP3) \
2089 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
2090 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
2091 V_XOR(YTMP1, YTMP3, YTMP1) \
2092 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
2093 V_XOR(YTMP1, YTMP4, YTMP1) \
2094 V_ADD(W_0, W_0, W_Y_M7) \
2095 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
2096 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
2097 V_ADD(W_0, W_0, YTMP1) \
2098 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
2099 V_SHIFT_R(YTMP1, W_14, 19) \
2100 V_SHIFT_L(YTMP2, W_14, 45) \
2101 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
2102 V_SHIFT_R(YTMP3, W_14, 61) \
2103 V_SHIFT_L(YTMP4, W_14, 3) \
2104 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
2105 V_OR(YTMP1, YTMP2, YTMP1) \
2106 V_OR(YTMP3, YTMP4, YTMP3) \
2107 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
2108 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
2109 V_XOR(YTMP1, YTMP3, YTMP1) \
2110 V_SHIFT_R(YTMP4, W_14, 6) \
2111 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
2112 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
2113 V_XOR(YTMP1, YTMP4, YTMP1) \
2114 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
2115 V_ADD(W_0, W_0, YTMP1) \
2116
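/* Note: in all of the MsgSched* macros the YMM schedule computation is
 * deliberately interleaved with the scalar RND_* round macros. The scalar
 * rounds consume W values already stored to the stack while the vector code
 * builds the words for later rounds, so the two instruction streams are
 * independent and can overlap in the pipeline. */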
2117
2118#define _INIT_MASK_Y(mask) \
2119 "vmovdqu %[mask], %%"#mask"\n\t"
2120#define INIT_MASK_Y(mask) \
2121 _INIT_MASK_Y(mask)
2122
2123/* Load into YMM registers and swap endian. */
2124#define _LOAD_BLOCK_W_Y_2(mask, ymm0, ymm1, reg, i) \
2125 /* buffer[0..15] => ymm0..ymm3; */ \
2126 "vmovdqu " #i "+ 0(%%" #reg "), %%" #ymm0 "\n\t" \
2127 "vmovdqu " #i "+32(%%" #reg "), %%" #ymm1 "\n\t" \
2128 "vpshufb %%" #mask ", %%" #ymm0 ", %%" #ymm0 "\n\t" \
2129 "vpshufb %%" #mask ", %%" #ymm1 ", %%" #ymm1 "\n\t"
2130
2131#define LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) \
2132 _LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i)
2133
2134#define LOAD_BLOCK_W_Y(mask, reg) \
2135 LOAD_BLOCK_W_Y_2(mask, W_Y_0, W_Y_4 , reg, 0) \
2136 LOAD_BLOCK_W_Y_2(mask, W_Y_8, W_Y_12, reg, 64)
2137
2138#define _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
2139 "vpaddq " #i "+ 0(%%" #reg "), %%" #ymm0 ", %%" #ymm2 "\n\t" \
2140 "vpaddq " #i "+32(%%" #reg "), %%" #ymm1 ", %%" #ymm3 "\n\t" \
2141 "vmovdqu %%" #ymm2 ", " #i "+ 0(" WX ")\n\t" \
2142 "vmovdqu %%" #ymm3 ", " #i "+32(" WX ")\n\t"
2143
2144#define SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
2145 _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i)
2146
2147#define SET_BLOCK_W_Y(reg) \
2148 SET_W_Y_2(W_Y_0, W_Y_4 , YTMP1, YTMP2, reg, 0) \
2149 SET_W_Y_2(W_Y_8, W_Y_12, YTMP1, YTMP2, reg, 64)
2150
2151/* Load into YMM registers and swap endian. */
2152#define _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
2153 "vmovdqu " #i "+ 0(%%" #reg "), %%" #X0 "\n\t" \
2154 "vmovdqu " #i "+ 16(%%" #reg "), %%" #X1 "\n\t" \
2155 "vmovdqu " #i "+128(%%" #reg "), %%" #X8 "\n\t" \
2156 "vmovdqu " #i "+144(%%" #reg "), %%" #X9 "\n\t" \
2157 "vinserti128 $1, %%" #X8 ", %%" #Y0 ", %%" #Y0 "\n\t" \
2158 "vinserti128 $1, %%" #X9 ", %%" #Y1 ", %%" #Y1 "\n\t" \
2159 "vpshufb %%" #mask ", %%" #Y0 ", %%" #Y0 "\n\t" \
2160 "vpshufb %%" #mask ", %%" #Y1 ", %%" #Y1 "\n\t"
2161
2162#define LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
2163 _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i)
2164
2165#define LOAD_BLOCK2_W_Y(mask, reg) \
2166 LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, 0) \
2167 LOAD_BLOCK2_W_Y_2(mask, Y2, Y3, X2, X3, X8, X9, reg, 32) \
2168 LOAD_BLOCK2_W_Y_2(mask, Y4, Y5, X4, X5, X8, X9, reg, 64) \
2169 LOAD_BLOCK2_W_Y_2(mask, Y6, Y7, X6, X7, X8, X9, reg, 96) \
2170
2171#define SET_BLOCK2_W_Y(reg) \
2172 SET_W_Y_2(Y0, Y1, YTMP1, YTMP2, reg, 0) \
2173 SET_W_Y_2(Y2, Y3, YTMP1, YTMP2, reg, 64) \
2174 SET_W_Y_2(Y4, Y5, YTMP1, YTMP2, reg, 128) \
2175 SET_W_Y_2(Y6, Y7, YTMP1, YTMP2, reg, 192)
2176
2177static const word64 K512_AVX2[160] = {
2178 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
2179 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
2180 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
2181 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
2182 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
2183 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
2184 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
2185 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
2186 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
2187 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
2188 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
2189 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
2190 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
2191 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
2192 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
2193 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
2194 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
2195 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
2196 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
2197 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
2198 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
2199 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
2200 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
2201 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
2202 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
2203 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
2204 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
2205 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
2206 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
2207 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
2208 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
2209 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
2210 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
2211 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
2212 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
2213 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
2214 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
2215 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
2216 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
2217 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
2218 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
2219 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
2220 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
2221 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
2222 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
2223 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
2224 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
2225 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
2226 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
2227 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
2228 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
2229 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
2230 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
2231 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
2232 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
2233 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
2234 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
2235 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
2236 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
2237 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
2238 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
2239 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
2240 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
2241 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
2242 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
2243 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
2244 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
2245 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
2246 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
2247 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
2248 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
2249 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
2250 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
2251 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
2252 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
2253 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
2254 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
2255 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
2256 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817),
2257 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
2258};
2259static const word64* K512_AVX2_END = &K512_AVX2[128];
2260
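/* Layout note: K512_AVX2 repeats each 128-bit pair of round constants in both
 * halves of a 256-bit row, so a single vpaddq in SET_BLOCK2_W_Y adds K[2j] and
 * K[2j+1] to the matching message words of the two blocks hashed side by side.
 * K512_AVX2_END marks where the two-block message-schedule loop stops; the
 * constants for rounds 64..79 are consumed by the SET_BLOCK2_W_Y call after
 * that loop. The disabled sketch below illustrates the layout, assuming K512
 * is the plain 80-entry constant table defined earlier in this file (the one
 * referenced by the [K512] asm operand of Transform_Sha512_AVX2). */
#if 0
static int CheckK512AVX2Layout(void)
{
    int j;
    for (j = 0; j < 40; j++) {
        /* low 128-bit lane holds K[2j], K[2j+1] ... */
        if (K512_AVX2[4*j + 0] != K512[2*j] ||
            K512_AVX2[4*j + 1] != K512[2*j + 1] ||
            /* ... and the high lane repeats the same pair for block two */
            K512_AVX2[4*j + 2] != K512[2*j] ||
            K512_AVX2[4*j + 3] != K512[2*j + 1]) {
            return -1;
        }
    }
    return 0;
}
#endif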
2261static int Transform_Sha512_AVX2(wc_Sha512* sha512)
2262{
2263 __asm__ __volatile__ (
2264
2265 /* 16 Ws plus loop counter and K512. */
2266 "subq $136, %%rsp\n\t"
2267 "leaq 64(%[sha512]), %%rax\n\t"
2268
2269 INIT_MASK(MASK_Y)
2270 LOAD_DIGEST()
2271
2272 LOAD_BLOCK_W_Y(MASK_Y, rax)
2273
2274 "movl $4, 16*8(" WX ")\n\t"
2275 "leaq %[K512], %%rsi\n\t"
2276 /* b */
2277 "movq %%r9, " L4 "\n\t"
2278 /* e */
2279 "movq %%r12, " L1 "\n\t"
2280 /* b ^ c */
2281 "xorq %%r10, " L4 "\n\t"
2282
2283 SET_BLOCK_W_Y(rsi)
2284
2285 "# Start of 16 rounds\n"
2286 "1:\n\t"
2287
2288 "addq $128, %%rsi\n\t"
2289
2290 MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
2291 MsgSched4_AVX2(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
2292 MsgSched4_AVX2(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
2293 MsgSched4_AVX2(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)
2294
2295 SET_BLOCK_W_Y(rsi)
2296
2297 "subl $1, 16*8(" WX ")\n\t"
2298 "jne 1b\n\t"
2299
2300 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
2301 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
2302 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
2303 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
2304
2305 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
2306 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
2307 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
2308 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
2309
2310 STORE_ADD_DIGEST()
2311
2312 "addq $136, %%rsp\n\t"
2313
2314 :
2315 : [mask] "m" (mBYTE_FLIP_MASK_Y),
2316 [sha512] "r" (sha512),
2317 [K512] "m" (K512)
2318 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
2319 );
2320
2321 return 0;
2322}
2323
2324static int Transform_Sha512_AVX2_Len(wc_Sha512* sha512, word32 len)
2325{
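    /* len is a multiple of WC_SHA512_BLOCK_SIZE here, so this bit test is
     * non-zero exactly when an odd number of blocks remains (e.g. len == 384,
     * i.e. three blocks). One block is then hashed on its own out of
     * sha512->buffer so that the two-block loop below always sees an even
     * block count. */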
2326 if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
2327 XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
2328 Transform_Sha512_AVX2(sha512);
2329 sha512->data += WC_SHA512_BLOCK_SIZE;
2330 len -= WC_SHA512_BLOCK_SIZE;
2331 if (len == 0)
2332 return 0;
2333 }
2334
2335 __asm__ __volatile__ (
2336
2337 "movq 224(%[sha512]), %%rcx\n\t"
2338
2339 INIT_MASK(MASK_Y)
2340 LOAD_DIGEST()
2341
2342 "# Start of processing two blocks\n"
2343 "2:\n\t"
2344
2345 "subq $1344, %%rsp\n\t"
2346 "leaq %[K512], %%rsi\n\t"
2347
2348 /* L4 = b */
2349 "movq %%r9, " L4 "\n\t"
2350 /* e */
2351 "movq %%r12, " L1 "\n\t"
2352
2353 LOAD_BLOCK2_W_Y(MASK_Y, rcx)
2354
2355 /* L4 = b ^ c */
2356 "xorq %%r10, " L4 "\n\t"
2357 "\n"
2358 "1:\n\t"
2359 SET_BLOCK2_W_Y(rsi)
2360 MsgSched2_AVX2(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
2361 MsgSched2_AVX2(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
2362 MsgSched2_AVX2(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
2363 MsgSched2_AVX2(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
2364 MsgSched2_AVX2(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
2365 MsgSched2_AVX2(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
2366 MsgSched2_AVX2(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
2367 MsgSched2_AVX2(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
2368 "addq $256, %%rsi\n\t"
2369 "addq $256, %%rsp\n\t"
2370 "cmpq %[K512_END], %%rsi\n\t"
2371 "jne 1b\n\t"
2372
2373 SET_BLOCK2_W_Y(rsi)
2374 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
2375 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
2376 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
2377 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)
2378
2379 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
2380 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
2381 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
2382 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
2383 "subq $1024, %%rsp\n\t"
2384
2385 ADD_DIGEST()
2386 STORE_DIGEST()
2387
2388 /* L4 = b */
2389 "movq %%r9, " L4 "\n\t"
2390 /* e */
2391 "movq %%r12, " L1 "\n\t"
2392 /* L4 = b ^ c */
2393 "xorq %%r10, " L4 "\n\t"
2394
2395 "movq $5, %%rsi\n\t"
2396 "\n"
2397 "3:\n\t"
2398 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
2399 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
2400 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
2401 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
2402
2403 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
2404 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
2405 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
2406 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
2407 "addq $256, %%rsp\n\t"
2408 "subq $1, %%rsi\n\t"
2409 "jnz 3b\n\t"
2410
2411 ADD_DIGEST()
2412
2413 "movq 224(%[sha512]), %%rcx\n\t"
2414 "addq $64, %%rsp\n\t"
2415 "addq $256, %%rcx\n\t"
2416 "subl $256, %[len]\n\t"
2417 "movq %%rcx, 224(%[sha512])\n\t"
2418
2419 STORE_DIGEST()
2420
2421 "jnz 2b\n\t"
2422
2423 :
2424 : [mask] "m" (mBYTE_FLIP_MASK_Y),
2425 [len] "m" (len),
2426 [sha512] "r" (sha512),
2427 [K512] "m" (K512_AVX2),
2428 [K512_END] "m" (K512_AVX2_END)
2429 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
2430 );
2431
2432 return 0;
2433}
2434
2435#ifdef HAVE_INTEL_RORX
2436static int Transform_Sha512_AVX2_RORX(wc_Sha512* sha512)
2437{
2438 __asm__ __volatile__ (
2439
2440 /* 16 Ws plus loop counter. */
2441 "subq $136, %%rsp\n\t"
2442 "leaq 64(%[sha512]), " L2 "\n\t"
2443
2444 INIT_MASK(MASK_Y)
2445 LOAD_DIGEST()
2446
2447 LOAD_BLOCK_W_Y(MASK_Y, rcx)
2448
2449 "movl $4, 16*8(" WX ")\n\t"
2450 "leaq %[K512], %%rsi\n\t"
2451 /* b */
2452 "movq %%r9, " L4 "\n\t"
2453 /* L3 = 0 (add to prev h) */
2454 "xorq " L3 ", " L3 "\n\t"
2455 /* b ^ c */
2456 "xorq %%r10, " L4 "\n\t"
2457
2458 SET_BLOCK_W_Y(rsi)
2459
2460 "# Start of 16 rounds\n"
2461 "1:\n\t"
2462
2463 "addq $128, %%rsi\n\t"
2464
2465 MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
2466 MsgSched4_AVX2_RORX_SET(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
2467 MsgSched4_AVX2_RORX_SET(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
2468 MsgSched4_AVX2_RORX_SET(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)
2469
2470 "subl $1, 16*8(%%rsp)\n\t"
2471 "jnz 1b\n\t"
2472
2473 RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 0)
2474 RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD, 4)
2475 RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 8)
2476 RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD,12)
2477 /* Prev RND: h += Maj(a,b,c) */
2478 "addq " L3 ", %%r8\n\t"
2479 "addq $136, %%rsp\n\t"
2480
2481 STORE_ADD_DIGEST()
2482
2483 :
2484 : [mask] "m" (mBYTE_FLIP_MASK_Y),
2485 [sha512] "r" (sha512),
2486 [K512] "m" (K512)
2487 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
2488 );
2489
2490 return 0;
2491}
2492
2493static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512* sha512, word32 len)
2494{
2495 if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
2496 XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
2497 Transform_Sha512_AVX2_RORX(sha512);
2498 sha512->data += WC_SHA512_BLOCK_SIZE;
2499 len -= WC_SHA512_BLOCK_SIZE;
2500 if (len == 0)
2501 return 0;
2502 }
2503
2504 __asm__ __volatile__ (
2505
2506 "movq 224(%[sha512]), %%rax\n\t"
2507
2508 INIT_MASK(MASK_Y)
2509 LOAD_DIGEST()
2510
2511 "# Start of processing two blocks\n"
2512 "2:\n\t"
2513
2514 "subq $1344, %%rsp\n\t"
2515 "leaq %[K512], %%rsi\n\t"
2516
2517 /* L4 = b */
2518 "movq %%r9, " L4 "\n\t"
2519 /* L3 = 0 (add to prev h) */
2520 "xorq " L3 ", " L3 "\n\t"
2521
2522 LOAD_BLOCK2_W_Y(MASK_Y, rax)
2523
2524 /* L4 = b ^ c */
2525 "xorq %%r10, " L4 "\n\t"
2526 "\n"
2527 "1:\n\t"
2528 SET_BLOCK2_W_Y(rsi)
2529 MsgSched2_AVX2_RORX(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
2530 MsgSched2_AVX2_RORX(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
2531 MsgSched2_AVX2_RORX(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
2532 MsgSched2_AVX2_RORX(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
2533 MsgSched2_AVX2_RORX(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
2534 MsgSched2_AVX2_RORX(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
2535 MsgSched2_AVX2_RORX(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
2536 MsgSched2_AVX2_RORX(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
2537 "addq $256, %%rsi\n\t"
2538 "addq $256, %%rsp\n\t"
2539 "cmpq %[K512_END], %%rsi\n\t"
2540 "jne 1b\n\t"
2541
2542 SET_BLOCK2_W_Y(rsi)
2543 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
2544 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
2545 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
2546 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)
2547
2548 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
2549 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
2550 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
2551 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
2552 "addq " L3 ", %%r8\n\t"
2553 "subq $1024, %%rsp\n\t"
2554
2555 ADD_DIGEST()
2556 STORE_DIGEST()
2557
2558 /* L4 = b */
2559 "movq %%r9, " L4 "\n\t"
2560 /* L3 = 0 (add to prev h) */
2561 "xorq " L3 ", " L3 "\n\t"
2562 /* L4 = b ^ c */
2563 "xorq %%r10, " L4 "\n\t"
2564
2565 "movq $5, %%rsi\n\t"
2566 "\n"
2567 "3:\n\t"
2568 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
2569 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
2570 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
2571 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
2572
2573 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
2574 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
2575 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
2576 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
2577 "addq $256, %%rsp\n\t"
2578 "subq $1, %%rsi\n\t"
2579 "jnz 3b\n\t"
2580
2581 "addq " L3 ", %%r8\n\t"
2582
2583 ADD_DIGEST()
2584
2585 "movq 224(%[sha512]), %%rax\n\t"
2586 "addq $64, %%rsp\n\t"
2587 "addq $256, %%rax\n\t"
2588 "subl $256, %[len]\n\t"
2589 "movq %%rax, 224(%[sha512])\n\t"
2590
2591 STORE_DIGEST()
2592
2593 "jnz 2b\n\t"
2594
2595 :
2596 : [mask] "m" (mBYTE_FLIP_MASK_Y),
2597 [len] "m" (len),
2598 [sha512] "r" (sha512),
2599 [K512] "m" (K512_AVX2),
2600 [K512_END] "m" (K512_AVX2_END)
2601 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
2602 );
2603
2604 return 0;
2605}
2606#endif /* HAVE_INTEL_RORX */
2607#endif /* HAVE_INTEL_AVX2 */
2608
2609#endif /* WOLFSSL_SHA512 */
2610
2611
2612/* -------------------------------------------------------------------------- */
2613/* SHA384 */
2614/* -------------------------------------------------------------------------- */
2615#ifdef WOLFSSL_SHA384
2616
2617#if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
2618 /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
2619#else
2620
2621static int InitSha384(wc_Sha384* sha384)
2622{
2623 if (sha384 == NULL) {
2624 return BAD_FUNC_ARG;
2625 }
2626
2627 sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
2628 sha384->digest[1] = W64LIT(0x629a292a367cd507);
2629 sha384->digest[2] = W64LIT(0x9159015a3070dd17);
2630 sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
2631 sha384->digest[4] = W64LIT(0x67332667ffc00b31);
2632 sha384->digest[5] = W64LIT(0x8eb44a8768581511);
2633 sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
2634 sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);
2635
2636 sha384->buffLen = 0;
2637 sha384->loLen = 0;
2638 sha384->hiLen = 0;
2639
2640 return 0;
2641}
2642
2643int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len)
2644{
2645 if (sha384 == NULL || (data == NULL && len > 0)) {
2646 return BAD_FUNC_ARG;
2647 }
2648
2649#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
2650 if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
2651 #if defined(HAVE_INTEL_QA)
2652 return IntelQaSymSha384(&sha384->asyncDev, NULL, data, len);
2653 #endif
2654 }
2655#endif /* WOLFSSL_ASYNC_CRYPT */
2656
2657 return Sha512Update((wc_Sha512*)sha384, data, len);
2658}
2659
2660
2661int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash)
2662{
2663#ifdef LITTLE_ENDIAN_ORDER
2664 word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)];
2665#endif
2666
2667 if (sha384 == NULL || hash == NULL) {
2668 return BAD_FUNC_ARG;
2669 }
2670
2671#ifdef LITTLE_ENDIAN_ORDER
2672 ByteReverseWords64((word64*)digest, (word64*)sha384->digest,
2673 WC_SHA384_DIGEST_SIZE);
2674 XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE);
2675#else
2676 XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
2677#endif
2678
2679 return 0;
2680}
2681
2682int wc_Sha384Final(wc_Sha384* sha384, byte* hash)
2683{
2684 int ret;
2685
2686 if (sha384 == NULL || hash == NULL) {
2687 return BAD_FUNC_ARG;
2688 }
2689
2690#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
2691 if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
2692 #if defined(HAVE_INTEL_QA)
2693 return IntelQaSymSha384(&sha384->asyncDev, hash, NULL,
2694 WC_SHA384_DIGEST_SIZE);
2695 #endif
2696 }
2697#endif /* WOLFSSL_ASYNC_CRYPT */
2698
2699 ret = Sha512Final((wc_Sha512*)sha384);
2700 if (ret != 0)
2701 return ret;
2702
2703 XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
2704
2705 return InitSha384(sha384); /* reset state */
2706}
2707
2708int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
2709{
2710 int ret;
2711
2712 if (sha384 == NULL) {
2713 return BAD_FUNC_ARG;
2714 }
2715
2716 sha384->heap = heap;
2717 ret = InitSha384(sha384);
2718 if (ret != 0)
2719 return ret;
2720
2721#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
2722 Sha512_SetTransform();
2723#endif
2724#ifdef WOLFSSL_SMALL_STACK_CACHE
2725 sha384->W = NULL;
2726#endif
2727
2728#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
2729 ret = wolfAsync_DevCtxInit(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384,
2730 sha384->heap, devId);
2731#else
2732 (void)devId;
2733#endif /* WOLFSSL_ASYNC_CRYPT */
2734
2735 return ret;
2736}
2737
2738#endif /* WOLFSSL_IMX6_CAAM */
2739
2740int wc_InitSha384(wc_Sha384* sha384)
2741{
2742 return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID);
2743}
2744
2745void wc_Sha384Free(wc_Sha384* sha384)
2746{
2747 if (sha384 == NULL)
2748 return;
2749
2750#ifdef WOLFSSL_SMALL_STACK_CACHE
2751 if (sha384->W != NULL) {
2752 XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
2753 sha384->W = NULL;
2754 }
2755#endif
2756
2757#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
2758 wolfAsync_DevCtxFree(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384);
2759#endif /* WOLFSSL_ASYNC_CRYPT */
2760}
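/* Typical one-shot use of the SHA-384 API above (illustrative sketch only;
 * data/dataLen are placeholder names and every return code should be checked
 * by the caller). Note that wc_Sha384Final() re-initializes the state, so the
 * context can be reused after it:
 *
 *     wc_Sha384 sha384;
 *     byte      hash[WC_SHA384_DIGEST_SIZE];
 *
 *     if (wc_InitSha384(&sha384) == 0) {
 *         wc_Sha384Update(&sha384, data, dataLen);
 *         wc_Sha384Final(&sha384, hash);
 *         wc_Sha384Free(&sha384);
 *     }
 */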
2761
2762#endif /* WOLFSSL_SHA384 */
2763
2764#endif /* HAVE_FIPS */
2765
2766#ifdef WOLFSSL_SHA512
2767
2768int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash)
2769{
2770 int ret;
2771 wc_Sha512 tmpSha512;
2772
2773 if (sha512 == NULL || hash == NULL)
2774 return BAD_FUNC_ARG;
2775
2776 ret = wc_Sha512Copy(sha512, &tmpSha512);
2777 if (ret == 0) {
2778 ret = wc_Sha512Final(&tmpSha512, hash);
2779 wc_Sha512Free(&tmpSha512);
2780 }
2781 return ret;
2782}
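/* wc_Sha512GetHash() returns the digest of the data hashed so far without
 * disturbing the running state: it finalizes a temporary copy and then frees
 * that copy. Illustrative sketch (part1/part2 names are placeholders):
 *
 *     byte interim[WC_SHA512_DIGEST_SIZE];
 *     wc_Sha512Update(&sha512, part1, part1Len);
 *     wc_Sha512GetHash(&sha512, interim);         digest of part1 only
 *     wc_Sha512Update(&sha512, part2, part2Len);  running hash is unaffected
 */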
2783
2784int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst)
2785{
2786 int ret = 0;
2787
2788 if (src == NULL || dst == NULL)
2789 return BAD_FUNC_ARG;
2790
2791 XMEMCPY(dst, src, sizeof(wc_Sha512));
2792#ifdef WOLFSSL_SMALL_STACK_CACHE
2793 dst->W = NULL;
2794#endif
2795
2796#ifdef WOLFSSL_ASYNC_CRYPT
2797 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2798#endif
2799
2800 return ret;
2801}
2802
2803#endif /* WOLFSSL_SHA512 */
2804
2805#ifdef WOLFSSL_SHA384
2806
2807int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash)
2808{
2809 int ret;
2810 wc_Sha384 tmpSha384;
2811
2812 if (sha384 == NULL || hash == NULL)
2813 return BAD_FUNC_ARG;
2814
2815 ret = wc_Sha384Copy(sha384, &tmpSha384);
2816 if (ret == 0) {
2817 ret = wc_Sha384Final(&tmpSha384, hash);
2818 wc_Sha384Free(&tmpSha384);
2819 }
2820 return ret;
2821}
2822int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst)
2823{
2824 int ret = 0;
2825
2826 if (src == NULL || dst == NULL)
2827 return BAD_FUNC_ARG;
2828
2829 XMEMCPY(dst, src, sizeof(wc_Sha384));
2830#ifdef WOLFSSL_SMALL_STACK_CACHE
2831 dst->W = NULL;
2832#endif
2833
2834#ifdef WOLFSSL_ASYNC_CRYPT
2835 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2836#endif
2837
2838 return ret;
2839}
2840
2841#endif /* WOLFSSL_SHA384 */
2842
2843#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */