source: azure_iot_hub/trunk/wolfssl-3.15.7/wolfcrypt/src/sha256.c@ 388

Last change on this file since 388 was 388, checked in by coas-nagasima, 5 years ago

Added a sample that uses the Azure IoT Hub Device C SDK

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
  • Property svn:mime-type set to text/x-csrc
File size: 111.0 KB
1/* sha256.c
2 *
3 * Copyright (C) 2006-2017 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL.
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20 */
21
22
23/* code submitted by raphael.huck@efixo.com */
24
25#ifdef HAVE_CONFIG_H
26 #include <config.h>
27#endif
28
29#include <wolfssl/wolfcrypt/settings.h>
30
31#if !defined(NO_SHA256) && !defined(WOLFSSL_ARMASM)
32
33#if defined(HAVE_FIPS) && \
34 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2)
35
36 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
37 #define FIPS_NO_WRAPPERS
38
39 #ifdef USE_WINDOWS_API
40 #pragma code_seg(".fipsA$d")
41 #pragma const_seg(".fipsB$d")
42 #endif
43#endif
44
45#include <wolfssl/wolfcrypt/sha256.h>
46#include <wolfssl/wolfcrypt/error-crypt.h>
47#include <wolfssl/wolfcrypt/cpuid.h>
48
49/* fips wrapper calls, user can call direct */
50#if defined(HAVE_FIPS) && \
51 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))
52
53 int wc_InitSha256(wc_Sha256* sha)
54 {
55 if (sha == NULL) {
56 return BAD_FUNC_ARG;
57 }
58 return InitSha256_fips(sha);
59 }
60 int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
61 {
62 (void)heap;
63 (void)devId;
64 if (sha == NULL) {
65 return BAD_FUNC_ARG;
66 }
67 return InitSha256_fips(sha);
68 }
69 int wc_Sha256Update(wc_Sha256* sha, const byte* data, word32 len)
70 {
71 if (sha == NULL || (data == NULL && len > 0)) {
72 return BAD_FUNC_ARG;
73 }
74
75 if (data == NULL && len == 0) {
76 /* valid, but do nothing */
77 return 0;
78 }
79
80 return Sha256Update_fips(sha, data, len);
81 }
82 int wc_Sha256Final(wc_Sha256* sha, byte* out)
83 {
84 if (sha == NULL || out == NULL) {
85 return BAD_FUNC_ARG;
86 }
87 return Sha256Final_fips(sha, out);
88 }
89 void wc_Sha256Free(wc_Sha256* sha)
90 {
91 (void)sha;
92 /* Not supported in FIPS */
93 }
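/* Editor's illustrative sketch (not part of the original file): one-shot
 * hashing through the wrappers above.  It uses only APIs defined in this
 * file (wc_InitSha256, wc_Sha256Update, wc_Sha256Final, wc_Sha256Free);
 * msg/msgLen and the function name are hypothetical.  Guarded by "#if 0"
 * so it can never affect the build. */
#if 0
static int example_sha256_oneshot(const byte* msg, word32 msgLen,
                                  byte digest[WC_SHA256_DIGEST_SIZE])
{
    wc_Sha256 sha;
    int ret = wc_InitSha256(&sha);                /* set initial state */
    if (ret == 0)
        ret = wc_Sha256Update(&sha, msg, msgLen); /* absorb the message */
    if (ret == 0)
        ret = wc_Sha256Final(&sha, digest);       /* pad and write 32 bytes */
    wc_Sha256Free(&sha);                          /* no-op in FIPS builds */
    return ret;
}
#endif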
94
95#else /* else build without fips, or for FIPS v2 */
96
97
98#if defined(WOLFSSL_TI_HASH)
99 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
100#else
101
102#include <wolfssl/wolfcrypt/logging.h>
103
104#ifdef NO_INLINE
105 #include <wolfssl/wolfcrypt/misc.h>
106#else
107 #define WOLFSSL_MISC_INCLUDED
108 #include <wolfcrypt/src/misc.c>
109#endif
110
111#ifdef WOLFSSL_DEVCRYPTO_HASH
112 #include <wolfssl/wolfcrypt/port/devcrypto/wc_devcrypto.h>
113#endif
114
115
116
117#if defined(USE_INTEL_SPEEDUP)
118 #if defined(__GNUC__) && ((__GNUC__ < 4) || \
119 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
120 #undef NO_AVX2_SUPPORT
121 #define NO_AVX2_SUPPORT
122 #endif
123 #if defined(__clang__) && ((__clang_major__ < 3) || \
124 (__clang_major__ == 3 && __clang_minor__ <= 5))
125 #define NO_AVX2_SUPPORT
126 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
127 #undef NO_AVX2_SUPPORT
128 #endif
129
130 #define HAVE_INTEL_AVX1
131 #ifndef NO_AVX2_SUPPORT
132 #define HAVE_INTEL_AVX2
133 #endif
134#endif /* USE_INTEL_SPEEDUP */
135
136#if defined(HAVE_INTEL_AVX2)
137 #define HAVE_INTEL_RORX
138#endif
139
140
141#if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH_SHA2) && \
142 (!defined(WOLFSSL_IMX6_CAAM) || defined(NO_IMX6_CAAM_HASH)) && \
143 !defined(WOLFSSL_AFALG_HASH) && !defined(WOLFSSL_DEVCRYPTO_HASH)
144static int InitSha256(wc_Sha256* sha256)
145{
146 int ret = 0;
147
148 if (sha256 == NULL)
149 return BAD_FUNC_ARG;
150
151 XMEMSET(sha256->digest, 0, sizeof(sha256->digest));
152 sha256->digest[0] = 0x6A09E667L;
153 sha256->digest[1] = 0xBB67AE85L;
154 sha256->digest[2] = 0x3C6EF372L;
155 sha256->digest[3] = 0xA54FF53AL;
156 sha256->digest[4] = 0x510E527FL;
157 sha256->digest[5] = 0x9B05688CL;
158 sha256->digest[6] = 0x1F83D9ABL;
159 sha256->digest[7] = 0x5BE0CD19L;
160
161 sha256->buffLen = 0;
162 sha256->loLen = 0;
163 sha256->hiLen = 0;
164
165 return ret;
166}
167#endif
168
169
170/* Hardware Acceleration */
171#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
172
173 /* in case intel instructions aren't available, plus we need the K[] global */
174 #define NEED_SOFT_SHA256
175
176 /*****
177 Intel AVX1/AVX2 Macro Control Structure
178
179 #define HAVE_INTEL_AVX1
180 #define HAVE_INTEL_AVX2
181
182 #define HAVE_INTEL_RORX
183
184
185 int InitSha256(wc_Sha256* sha256) {
186 Save/Recover XMM, YMM
187 ...
188 }
189
190 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
191 Transform_Sha256(); Function prototype
192 #else
193 Transform_Sha256() { }
194 int Sha256Final() {
195 Save/Recover XMM, YMM
196 ...
197 }
198 #endif
199
200 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
 201 #if defined(HAVE_INTEL_RORX)
 202 #define RND with rorx instruction
203 #else
204 #define RND
205 #endif
206 #endif
207
208 #if defined(HAVE_INTEL_AVX1)
209
210 #define XMM Instructions/inline asm
211
212 int Transform_Sha256() {
213 Stitched Message Sched/Round
214 }
215
216 #elif defined(HAVE_INTEL_AVX2)
217
218 #define YMM Instructions/inline asm
219
220 int Transform_Sha256() {
 221 More granular Stitched Message Sched/Round
222 }
223
224 #endif
225
226 */
227
 228 /* Each platform needs to query info type 1 from cpuid to see which SIMD
 229 * extensions are supported. Also, set up a macro for proper linkage w/o ABI conflicts
230 */
231
232 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
233 static int Transform_Sha256(wc_Sha256* sha256);
234 #if defined(HAVE_INTEL_AVX1)
235 static int Transform_Sha256_AVX1(wc_Sha256 *sha256);
236 static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, word32 len);
237 #endif
238 #if defined(HAVE_INTEL_AVX2)
239 static int Transform_Sha256_AVX2(wc_Sha256 *sha256);
240 static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, word32 len);
241 #ifdef HAVE_INTEL_RORX
242 static int Transform_Sha256_AVX1_RORX(wc_Sha256 *sha256);
243 static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256, word32 len);
244 static int Transform_Sha256_AVX2_RORX(wc_Sha256 *sha256);
245 static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, word32 len);
246 #endif
247 #endif
248 static int (*Transform_Sha256_p)(wc_Sha256* sha256);
 249 /* = Transform_Sha256 */
250 static int (*Transform_Sha256_Len_p)(wc_Sha256* sha256, word32 len);
251 /* = NULL */
252 static int transform_check = 0;
253 static word32 intel_flags;
254 #define XTRANSFORM(S) (*Transform_Sha256_p)((S))
255 #define XTRANSFORM_LEN(S, L) (*Transform_Sha256_Len_p)((S),(L))
256
257 static void Sha256_SetTransform(void)
258 {
259
260 if (transform_check)
261 return;
262
263 intel_flags = cpuid_get_flags();
264
265 #ifdef HAVE_INTEL_AVX2
266 if (IS_INTEL_AVX2(intel_flags)) {
267 #ifdef HAVE_INTEL_RORX
268 if (IS_INTEL_BMI2(intel_flags)) {
269 Transform_Sha256_p = Transform_Sha256_AVX2_RORX;
270 Transform_Sha256_Len_p = Transform_Sha256_AVX2_RORX_Len;
271 }
272 else
273 #endif
274 if (1)
275 {
276 Transform_Sha256_p = Transform_Sha256_AVX2;
277 Transform_Sha256_Len_p = Transform_Sha256_AVX2_Len;
278 }
279 #ifdef HAVE_INTEL_RORX
280 else {
281 Transform_Sha256_p = Transform_Sha256_AVX1_RORX;
282 Transform_Sha256_Len_p = Transform_Sha256_AVX1_RORX_Len;
283 }
284 #endif
285 }
286 else
287 #endif
288 #ifdef HAVE_INTEL_AVX1
289 if (IS_INTEL_AVX1(intel_flags)) {
290 Transform_Sha256_p = Transform_Sha256_AVX1;
291 Transform_Sha256_Len_p = Transform_Sha256_AVX1_Len;
292 }
293 else
294 #endif
295 {
296 Transform_Sha256_p = Transform_Sha256;
297 Transform_Sha256_Len_p = NULL;
298 }
299
300 transform_check = 1;
301 }
302
303 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
304 {
305 int ret = 0;
306 if (sha256 == NULL)
307 return BAD_FUNC_ARG;
308
309 sha256->heap = heap;
310
311 ret = InitSha256(sha256);
312 if (ret != 0)
313 return ret;
314
315 /* choose best Transform function under this runtime environment */
316 Sha256_SetTransform();
317
318 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
319 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
320 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
321 #else
322 (void)devId;
323 #endif /* WOLFSSL_ASYNC_CRYPT */
324
325 return ret;
326 }
327
328#elif defined(FREESCALE_LTC_SHA)
329 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
330 {
331 (void)heap;
332 (void)devId;
333
334 LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0);
335
336 return 0;
337 }
338
339#elif defined(FREESCALE_MMCAU_SHA)
340
341 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
342 #include "cau_api.h"
343 #else
344 #include "fsl_mmcau.h"
345 #endif
346
347 #define XTRANSFORM(S) Transform_Sha256((S))
348 #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L))
349
350 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
351 {
352 int ret = 0;
353
354 (void)heap;
355 (void)devId;
356
357 ret = wolfSSL_CryptHwMutexLock();
358 if (ret != 0) {
359 return ret;
360 }
361 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
362 cau_sha256_initialize_output(sha256->digest);
363 #else
364 MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest);
365 #endif
366 wolfSSL_CryptHwMutexUnLock();
367
368 sha256->buffLen = 0;
369 sha256->loLen = 0;
370 sha256->hiLen = 0;
371
372 return ret;
373 }
374
375 static int Transform_Sha256(wc_Sha256* sha256)
376 {
377 int ret = wolfSSL_CryptHwMutexLock();
378 if (ret == 0) {
379 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
380 cau_sha256_hash_n((byte*)sha256->buffer, 1, sha256->digest);
381 #else
382 MMCAU_SHA256_HashN((byte*)sha256->buffer, 1, sha256->digest);
383 #endif
384 wolfSSL_CryptHwMutexUnLock();
385 }
386 return ret;
387 }
388
389#elif defined(WOLFSSL_PIC32MZ_HASH)
390 #include <wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h>
391
392#elif defined(STM32_HASH_SHA2)
393
394 /* Supports CubeMX HAL or Standard Peripheral Library */
395
396 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
397 {
398 if (sha256 == NULL)
399 return BAD_FUNC_ARG;
400
401 (void)devId;
402 (void)heap;
403
404 wc_Stm32_Hash_Init(&sha256->stmCtx);
405 return 0;
406 }
407
408 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
409 {
410 int ret = 0;
411
412 if (sha256 == NULL || (data == NULL && len > 0)) {
413 return BAD_FUNC_ARG;
414 }
415
416 ret = wolfSSL_CryptHwMutexLock();
417 if (ret == 0) {
418 ret = wc_Stm32_Hash_Update(&sha256->stmCtx,
419 HASH_AlgoSelection_SHA256, data, len);
420 wolfSSL_CryptHwMutexUnLock();
421 }
422 return ret;
423 }
424
425 int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
426 {
427 int ret = 0;
428
429 if (sha256 == NULL || hash == NULL) {
430 return BAD_FUNC_ARG;
431 }
432
433 ret = wolfSSL_CryptHwMutexLock();
434 if (ret == 0) {
435 ret = wc_Stm32_Hash_Final(&sha256->stmCtx,
436 HASH_AlgoSelection_SHA256, hash, WC_SHA256_DIGEST_SIZE);
437 wolfSSL_CryptHwMutexUnLock();
438 }
439
440 (void)wc_InitSha256(sha256); /* reset state */
441
442 return ret;
443 }
444
445#elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
446 /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */
447
448#elif defined(WOLFSSL_AFALG_HASH)
449 /* implemented in wolfcrypt/src/port/af_alg/afalg_hash.c */
450
451#elif defined(WOLFSSL_DEVCRYPTO_HASH)
452 /* implemented in wolfcrypt/src/port/devcrypto/devcrypt_hash.c */
453
454#else
455 #define NEED_SOFT_SHA256
456
457 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
458 {
459 int ret = 0;
460 if (sha256 == NULL)
461 return BAD_FUNC_ARG;
462
463 sha256->heap = heap;
464
465 ret = InitSha256(sha256);
466 if (ret != 0)
467 return ret;
468
469 #ifdef WOLFSSL_SMALL_STACK_CACHE
470 sha256->W = NULL;
471 #endif
472
473 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
474 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
475 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
476 #else
477 (void)devId;
478 #endif /* WOLFSSL_ASYNC_CRYPT */
479
480 return ret;
481 }
482#endif /* End Hardware Acceleration */
483
484#ifdef NEED_SOFT_SHA256
485
486 static const ALIGN32 word32 K[64] = {
487 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
488 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
489 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
490 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
491 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
492 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
493 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
494 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
495 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
496 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
497 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
498 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
499 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
500 };
501
502 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
503 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
504 #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
505
506 #define S(x, n) rotrFixed(x, n)
507 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
508 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
509 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
510 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
511
512 #define a(i) S[(0-i) & 7]
513 #define b(i) S[(1-i) & 7]
514 #define c(i) S[(2-i) & 7]
515 #define d(i) S[(3-i) & 7]
516 #define e(i) S[(4-i) & 7]
517 #define f(i) S[(5-i) & 7]
518 #define g(i) S[(6-i) & 7]
519 #define h(i) S[(7-i) & 7]
520
521 #define RND(j) \
522 t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + W[i+j]; \
523 t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
524 d(j) += t0; \
525 h(j) = t0 + t1
526
527 #ifndef XTRANSFORM
528 #define XTRANSFORM(S) Transform_Sha256((S))
529 #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L))
530 #endif
531
532 static int Transform_Sha256(wc_Sha256* sha256)
533 {
534 word32 S[8], t0, t1;
535 int i;
536
537 #ifdef WOLFSSL_SMALL_STACK_CACHE
538 word32* W = sha256->W;
539 if (W == NULL) {
540 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
541 DYNAMIC_TYPE_DIGEST);
542 if (W == NULL)
543 return MEMORY_E;
544 sha256->W = W;
545 }
546 #elif defined(WOLFSSL_SMALL_STACK)
547 word32* W;
548 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
549 DYNAMIC_TYPE_TMP_BUFFER);
550 if (W == NULL)
551 return MEMORY_E;
552 #else
553 word32 W[WC_SHA256_BLOCK_SIZE];
554 #endif
555
556 /* Copy context->state[] to working vars */
557 for (i = 0; i < 8; i++)
558 S[i] = sha256->digest[i];
559
560 for (i = 0; i < 16; i++)
561 W[i] = sha256->buffer[i];
562
563 for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
564 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
565
566 #ifdef USE_SLOW_SHA256
567 /* not unrolled - ~2k smaller and ~25% slower */
568 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
569 int j;
570 for (j = 0; j < 8; j++) { /* braces needed here for macros {} */
571 RND(j);
572 }
573 }
574 #else
575 /* partially loop unrolled */
576 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
577 RND(0); RND(1); RND(2); RND(3);
578 RND(4); RND(5); RND(6); RND(7);
579 }
580 #endif /* USE_SLOW_SHA256 */
581
582 /* Add the working vars back into digest state[] */
583 for (i = 0; i < 8; i++) {
584 sha256->digest[i] += S[i];
585 }
586
587 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
588 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
589 #endif
590 return 0;
591 }
592#endif
593/* End wc_ software implementation */
594
595
596#ifdef XTRANSFORM
597
598 static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len)
599{
600 word32 tmp = sha256->loLen;
601 if ( (sha256->loLen += len) < tmp)
602 sha256->hiLen++; /* carry low to high */
603}
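/* Editor's illustration (not part of the original file): loLen/hiLen act as
 * a single 64-bit byte counter, i.e. total = ((word64)hiLen << 32) | loLen.
 * For example, after hashing 0x100000010 bytes in total, loLen == 0x00000010
 * and hiLen == 0x00000001.  Sha256Final() below turns this byte count into
 * the bit count required by SHA-256 padding by shifting the pair left by 3
 * (see "put lengths in bits"). */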
604
605 static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
606 {
607 int ret = 0;
608 byte* local;
609
610 if (sha256 == NULL || (data == NULL && len > 0)) {
611 return BAD_FUNC_ARG;
612 }
613
614 if (data == NULL && len == 0) {
615 /* valid, but do nothing */
616 return 0;
617 }
618
619 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
620 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
621 #if defined(HAVE_INTEL_QA)
622 return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len);
623 #endif
624 }
625 #endif /* WOLFSSL_ASYNC_CRYPT */
626
627 /* do block size increments */
628 local = (byte*)sha256->buffer;
629
630 /* check that internal buffLen is valid */
631 if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE)
632 return BUFFER_E;
633
634 if (sha256->buffLen > 0) {
635 word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
636 XMEMCPY(&local[sha256->buffLen], data, add);
637
638 sha256->buffLen += add;
639 data += add;
640 len -= add;
641
642 if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
643 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
644 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
645 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
646 #endif
647 {
648 ByteReverseWords(sha256->buffer, sha256->buffer,
649 WC_SHA256_BLOCK_SIZE);
650 }
651 #endif
652 ret = XTRANSFORM(sha256);
653 if (ret == 0) {
654 AddLength(sha256, WC_SHA256_BLOCK_SIZE);
655 sha256->buffLen = 0;
656 }
657 else
658 len = 0;
659 }
660 }
661
662 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
663 if (Transform_Sha256_Len_p != NULL) {
664 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1);
665
666 if (blocksLen > 0) {
667 AddLength(sha256, blocksLen);
668 sha256->data = data;
669 /* Byte reversal performed in function if required. */
670 XTRANSFORM_LEN(sha256, blocksLen);
671 data += blocksLen;
672 len -= blocksLen;
673 }
674 }
675 else
676 #endif
677 #if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \
678 defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
679 {
680 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1);
681
682 AddLength(sha256, blocksLen);
683 while (len >= WC_SHA256_BLOCK_SIZE) {
684 XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE);
685
686 data += WC_SHA256_BLOCK_SIZE;
687 len -= WC_SHA256_BLOCK_SIZE;
688
689 /* Byte reversal performed in function if required. */
690 ret = XTRANSFORM(sha256);
691 if (ret != 0)
692 break;
693 }
694 }
695 #else
696 {
697 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1);
698
699 AddLength(sha256, blocksLen);
700 while (len >= WC_SHA256_BLOCK_SIZE) {
701 XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE);
702
703 data += WC_SHA256_BLOCK_SIZE;
704 len -= WC_SHA256_BLOCK_SIZE;
705
706 ByteReverseWords(sha256->buffer, sha256->buffer,
707 WC_SHA256_BLOCK_SIZE);
708 ret = XTRANSFORM(sha256);
709 if (ret != 0)
710 break;
711 }
712 }
713 #endif
714
715 if (len > 0) {
716 XMEMCPY(local, data, len);
717 sha256->buffLen = len;
718 }
719
720 return ret;
721 }
722
723 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
724 {
725 return Sha256Update(sha256, data, len);
726 }
727
728 static WC_INLINE int Sha256Final(wc_Sha256* sha256)
729 {
730
731 int ret;
732 byte* local = (byte*)sha256->buffer;
733
734 if (sha256 == NULL) {
735 return BAD_FUNC_ARG;
736 }
737
738 AddLength(sha256, sha256->buffLen); /* before adding pads */
739 local[sha256->buffLen++] = 0x80; /* add 1 */
740
741 /* pad with zeros */
742 if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
743 XMEMSET(&local[sha256->buffLen], 0,
744 WC_SHA256_BLOCK_SIZE - sha256->buffLen);
745 sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
746
747 {
748 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
749 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
750 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
751 #endif
752 {
753 ByteReverseWords(sha256->buffer, sha256->buffer,
754 WC_SHA256_BLOCK_SIZE);
755 }
756 #endif
757 }
758
759 ret = XTRANSFORM(sha256);
760 if (ret != 0)
761 return ret;
762
763 sha256->buffLen = 0;
764 }
765 XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen);
766
767 /* put lengths in bits */
768 sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
769 (sha256->hiLen << 3);
770 sha256->loLen = sha256->loLen << 3;
771
772 /* store lengths */
773 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
774 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
775 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
776 #endif
777 {
778 ByteReverseWords(sha256->buffer, sha256->buffer,
779 WC_SHA256_BLOCK_SIZE);
780 }
781 #endif
782 /* ! length ordering dependent on digest endian type ! */
783 XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
784 XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
785 sizeof(word32));
786
787 #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \
788 defined(HAVE_INTEL_AVX2)
789 /* Kinetis requires only these bytes reversed */
790 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
791 if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
792 #endif
793 {
794 ByteReverseWords(
795 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
796 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
797 2 * sizeof(word32));
798 }
799 #endif
800
801 return XTRANSFORM(sha256);
802 }
803
804 int wc_Sha256FinalRaw(wc_Sha256* sha256, byte* hash)
805 {
806 #ifdef LITTLE_ENDIAN_ORDER
807 word32 digest[WC_SHA256_DIGEST_SIZE / sizeof(word32)];
808 #endif
809
810 if (sha256 == NULL || hash == NULL) {
811 return BAD_FUNC_ARG;
812 }
813
814 #ifdef LITTLE_ENDIAN_ORDER
815 ByteReverseWords((word32*)digest, (word32*)sha256->digest,
816 WC_SHA256_DIGEST_SIZE);
817 XMEMCPY(hash, digest, WC_SHA256_DIGEST_SIZE);
818 #else
819 XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
820 #endif
821
822 return 0;
823 }
824
825 int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
826 {
827 int ret;
828
829 if (sha256 == NULL || hash == NULL) {
830 return BAD_FUNC_ARG;
831 }
832
833 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
834 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
835 #if defined(HAVE_INTEL_QA)
836 return IntelQaSymSha256(&sha256->asyncDev, hash, NULL,
837 WC_SHA256_DIGEST_SIZE);
838 #endif
839 }
840 #endif /* WOLFSSL_ASYNC_CRYPT */
841
842 ret = Sha256Final(sha256);
843 if (ret != 0)
844 return ret;
845
846 #if defined(LITTLE_ENDIAN_ORDER)
847 ByteReverseWords(sha256->digest, sha256->digest, WC_SHA256_DIGEST_SIZE);
848 #endif
849 XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
850
851 return InitSha256(sha256); /* reset state */
852 }
853
854#endif /* XTRANSFORM */
855
856
857#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
858
859#define _LOAD_DIGEST() \
860 "movl (%[sha256]), %%r8d \n\t" \
861 "movl 4(%[sha256]), %%r9d \n\t" \
862 "movl 8(%[sha256]), %%r10d\n\t" \
863 "movl 12(%[sha256]), %%r11d\n\t" \
864 "movl 16(%[sha256]), %%r12d\n\t" \
865 "movl 20(%[sha256]), %%r13d\n\t" \
866 "movl 24(%[sha256]), %%r14d\n\t" \
867 "movl 28(%[sha256]), %%r15d\n\t"
868
869#define _STORE_ADD_DIGEST() \
870 "addl %%r8d , (%[sha256])\n\t" \
871 "addl %%r9d , 4(%[sha256])\n\t" \
872 "addl %%r10d, 8(%[sha256])\n\t" \
873 "addl %%r11d, 12(%[sha256])\n\t" \
874 "addl %%r12d, 16(%[sha256])\n\t" \
875 "addl %%r13d, 20(%[sha256])\n\t" \
876 "addl %%r14d, 24(%[sha256])\n\t" \
877 "addl %%r15d, 28(%[sha256])\n\t"
878
879#define _ADD_DIGEST() \
880 "addl (%[sha256]), %%r8d \n\t" \
881 "addl 4(%[sha256]), %%r9d \n\t" \
882 "addl 8(%[sha256]), %%r10d\n\t" \
883 "addl 12(%[sha256]), %%r11d\n\t" \
884 "addl 16(%[sha256]), %%r12d\n\t" \
885 "addl 20(%[sha256]), %%r13d\n\t" \
886 "addl 24(%[sha256]), %%r14d\n\t" \
887 "addl 28(%[sha256]), %%r15d\n\t"
888
889#define _STORE_DIGEST() \
890 "movl %%r8d , (%[sha256])\n\t" \
891 "movl %%r9d , 4(%[sha256])\n\t" \
892 "movl %%r10d, 8(%[sha256])\n\t" \
893 "movl %%r11d, 12(%[sha256])\n\t" \
894 "movl %%r12d, 16(%[sha256])\n\t" \
895 "movl %%r13d, 20(%[sha256])\n\t" \
896 "movl %%r14d, 24(%[sha256])\n\t" \
897 "movl %%r15d, 28(%[sha256])\n\t"
898
899#define LOAD_DIGEST() \
900 _LOAD_DIGEST()
901
902#define STORE_ADD_DIGEST() \
903 _STORE_ADD_DIGEST()
904
905#define ADD_DIGEST() \
906 _ADD_DIGEST()
907
908#define STORE_DIGEST() \
909 _STORE_DIGEST()
910
911
912#define S_0 %r8d
913#define S_1 %r9d
914#define S_2 %r10d
915#define S_3 %r11d
916#define S_4 %r12d
917#define S_5 %r13d
918#define S_6 %r14d
919#define S_7 %r15d
920
921#define L1 "%%edx"
922#define L2 "%%ecx"
923#define L3 "%%eax"
924#define L4 "%%ebx"
925#define WK "%%rsp"
926
927#define WORK_REGS "eax", "ebx", "ecx", "edx"
928#define STATE_REGS "r8","r9","r10","r11","r12","r13","r14","r15"
929#define XMM_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", \
930 "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13"
931
932#if defined(HAVE_INTEL_RORX)
933#define RND_STEP_RORX_0_1(a, b, c, d, e, f, g, h, i) \
934 /* L3 = f */ \
935 "movl %" #f ", " L3 "\n\t" \
936 /* L2 = e>>>11 */ \
937 "rorx $11, %" #e ", " L2 "\n\t" \
938 /* h += w_k */ \
939 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
940
941#define RND_STEP_RORX_0_2(a, b, c, d, e, f, g, h, i) \
942 /* L2 = (e>>>6) ^ (e>>>11) */ \
943 "xorl " L1 ", " L2 "\n\t" \
944 /* L3 = f ^ g */ \
945 "xorl %" #g ", " L3 "\n\t" \
946 /* L1 = e>>>25 */ \
947 "rorx $25, %" #e ", " L1 "\n\t" \
948
949#define RND_STEP_RORX_0_3(a, b, c, d, e, f, g, h, i) \
950 /* L3 = (f ^ g) & e */ \
951 "andl %" #e ", " L3 "\n\t" \
952 /* L1 = Sigma1(e) */ \
953 "xorl " L2 ", " L1 "\n\t" \
954 /* L2 = a>>>13 */ \
955 "rorx $13, %" #a ", " L2 "\n\t" \
956
957#define RND_STEP_RORX_0_4(a, b, c, d, e, f, g, h, i) \
958 /* h += Sigma1(e) */ \
959 "addl " L1 ", %" #h "\n\t" \
960 /* L1 = a>>>2 */ \
961 "rorx $2, %" #a ", " L1 "\n\t" \
962 /* L3 = Ch(e,f,g) */ \
963 "xorl %" #g ", " L3 "\n\t" \
964
965#define RND_STEP_RORX_0_5(a, b, c, d, e, f, g, h, i) \
966 /* L2 = (a>>>2) ^ (a>>>13) */ \
967 "xorl " L1 ", " L2 "\n\t" \
968 /* L1 = a>>>22 */ \
969 "rorx $22, %" #a ", " L1 "\n\t" \
970 /* h += Ch(e,f,g) */ \
971 "addl " L3 ", %" #h "\n\t" \
972
973#define RND_STEP_RORX_0_6(a, b, c, d, e, f, g, h, i) \
974 /* L1 = Sigma0(a) */ \
975 "xorl " L2 ", " L1 "\n\t" \
976 /* L3 = b */ \
977 "movl %" #b ", " L3 "\n\t" \
978 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
979 "addl %" #h ", %" #d "\n\t" \
980
981#define RND_STEP_RORX_0_7(a, b, c, d, e, f, g, h, i) \
982 /* L3 = a ^ b */ \
983 "xorl %" #a ", " L3 "\n\t" \
984 /* h += Sigma0(a) */ \
985 "addl " L1 ", %" #h "\n\t" \
986 /* L4 = (a ^ b) & (b ^ c) */ \
987 "andl " L3 ", " L4 "\n\t" \
988
989#define RND_STEP_RORX_0_8(a, b, c, d, e, f, g, h, i) \
990 /* L4 = Maj(a,b,c) */ \
991 "xorl %" #b ", " L4 "\n\t" \
992 /* L1 = d>>>6 (= e>>>6 next RND) */ \
993 "rorx $6, %" #d ", " L1 "\n\t" \
994 /* h += Maj(a,b,c) */ \
995 "addl " L4 ", %" #h "\n\t" \
996
997#define RND_STEP_RORX_1_1(a, b, c, d, e, f, g, h, i) \
998 /* L4 = f */ \
999 "movl %" #f ", " L4 "\n\t" \
1000 /* L2 = e>>>11 */ \
1001 "rorx $11, %" #e ", " L2 "\n\t" \
1002 /* h += w_k */ \
1003 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
1004
1005#define RND_STEP_RORX_1_2(a, b, c, d, e, f, g, h, i) \
1006 /* L2 = (e>>>6) ^ (e>>>11) */ \
1007 "xorl " L1 ", " L2 "\n\t" \
1008 /* L4 = f ^ g */ \
1009 "xorl %" #g ", " L4 "\n\t" \
1010 /* L1 = e>>>25 */ \
1011 "rorx $25, %" #e ", " L1 "\n\t" \
1012
1013#define RND_STEP_RORX_1_3(a, b, c, d, e, f, g, h, i) \
1014 /* L4 = (f ^ g) & e */ \
1015 "andl %" #e ", " L4 "\n\t" \
1016 /* L1 = Sigma1(e) */ \
1017 "xorl " L2 ", " L1 "\n\t" \
1018 /* L2 = a>>>13 */ \
1019 "rorx $13, %" #a ", " L2 "\n\t" \
1020
1021#define RND_STEP_RORX_1_4(a, b, c, d, e, f, g, h, i) \
1022 /* h += Sigma1(e) */ \
1023 "addl " L1 ", %" #h "\n\t" \
1024 /* L1 = a>>>2 */ \
1025 "rorx $2, %" #a ", " L1 "\n\t" \
1026 /* L4 = Ch(e,f,g) */ \
1027 "xorl %" #g ", " L4 "\n\t" \
1028
1029#define RND_STEP_RORX_1_5(a, b, c, d, e, f, g, h, i) \
1030 /* L2 = (a>>>2) ^ (a>>>13) */ \
1031 "xorl " L1 ", " L2 "\n\t" \
1032 /* L1 = a>>>22 */ \
1033 "rorx $22, %" #a ", " L1 "\n\t" \
1034 /* h += Ch(e,f,g) */ \
1035 "addl " L4 ", %" #h "\n\t" \
1036
1037#define RND_STEP_RORX_1_6(a, b, c, d, e, f, g, h, i) \
1038 /* L1 = Sigma0(a) */ \
1039 "xorl " L2 ", " L1 "\n\t" \
1040 /* L4 = b */ \
1041 "movl %" #b ", " L4 "\n\t" \
1042 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1043 "addl %"#h", %"#d"\n\t" \
1044
1045#define RND_STEP_RORX_1_7(a, b, c, d, e, f, g, h, i) \
1046 /* L4 = a ^ b */ \
1047 "xorl %" #a ", " L4 "\n\t" \
1048 /* h += Sigma0(a) */ \
1049 "addl " L1 ", %" #h "\n\t" \
1050 /* L3 = (a ^ b) & (b ^ c) */ \
1051 "andl " L4 ", " L3 "\n\t" \
1052
1053#define RND_STEP_RORX_1_8(a, b, c, d, e, f, g, h, i) \
1054 /* L3 = Maj(a,b,c) */ \
1055 "xorl %" #b ", " L3 "\n\t" \
1056 /* L1 = d>>>6 (= e>>>6 next RND) */ \
1057 "rorx $6, %" #d ", " L1 "\n\t" \
1058 /* h += Maj(a,b,c) */ \
1059 "addl " L3 ", %" #h "\n\t" \
1060
1061#define _RND_RORX_X_0(a, b, c, d, e, f, g, h, i) \
1062 /* L1 = e>>>6 */ \
1063 "rorx $6, %" #e ", " L1 "\n\t" \
1064 /* L2 = e>>>11 */ \
1065 "rorx $11, %" #e ", " L2 "\n\t" \
1066 /* Prev RND: h += Maj(a,b,c) */ \
1067 "addl " L3 ", %" #a "\n\t" \
1068 /* h += w_k */ \
1069 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
1070 /* L3 = f */ \
1071 "movl %" #f ", " L3 "\n\t" \
1072 /* L2 = (e>>>6) ^ (e>>>11) */ \
1073 "xorl " L1 ", " L2 "\n\t" \
1074 /* L3 = f ^ g */ \
1075 "xorl %" #g ", " L3 "\n\t" \
1076 /* L1 = e>>>25 */ \
1077 "rorx $25, %" #e ", " L1 "\n\t" \
1078 /* L1 = Sigma1(e) */ \
1079 "xorl " L2 ", " L1 "\n\t" \
1080 /* L3 = (f ^ g) & e */ \
1081 "andl %" #e ", " L3 "\n\t" \
1082 /* h += Sigma1(e) */ \
1083 "addl " L1 ", %" #h "\n\t" \
1084 /* L1 = a>>>2 */ \
1085 "rorx $2, %" #a ", " L1 "\n\t" \
1086 /* L2 = a>>>13 */ \
1087 "rorx $13, %" #a ", " L2 "\n\t" \
1088 /* L3 = Ch(e,f,g) */ \
1089 "xorl %" #g ", " L3 "\n\t" \
1090 /* L2 = (a>>>2) ^ (a>>>13) */ \
1091 "xorl " L1 ", " L2 "\n\t" \
1092 /* L1 = a>>>22 */ \
1093 "rorx $22, %" #a ", " L1 "\n\t" \
1094 /* h += Ch(e,f,g) */ \
1095 "addl " L3 ", %" #h "\n\t" \
1096 /* L1 = Sigma0(a) */ \
1097 "xorl " L2 ", " L1 "\n\t" \
1098 /* L3 = b */ \
1099 "movl %" #b ", " L3 "\n\t" \
1100 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1101 "addl %" #h ", %" #d "\n\t" \
1102 /* L3 = a ^ b */ \
1103 "xorl %" #a ", " L3 "\n\t" \
1104 /* L4 = (a ^ b) & (b ^ c) */ \
1105 "andl " L3 ", " L4 "\n\t" \
1106 /* h += Sigma0(a) */ \
1107 "addl " L1 ", %" #h "\n\t" \
1108 /* L4 = Maj(a,b,c) */ \
1109 "xorl %" #b ", " L4 "\n\t" \
1110
1111#define _RND_RORX_X_1(a, b, c, d, e, f, g, h, i) \
1112 /* L1 = e>>>6 */ \
1113 "rorx $6, %" #e ", " L1 "\n\t" \
1114 /* L2 = e>>>11 */ \
1115 "rorx $11, %" #e ", " L2 "\n\t" \
1116 /* Prev RND: h += Maj(a,b,c) */ \
1117 "addl " L4 ", %" #a "\n\t" \
1118 /* h += w_k */ \
1119 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
1120 /* L4 = f */ \
1121 "movl %" #f ", " L4 "\n\t" \
1122 /* L2 = (e>>>6) ^ (e>>>11) */ \
1123 "xorl " L1 ", " L2 "\n\t" \
1124 /* L4 = f ^ g */ \
1125 "xorl %" #g ", " L4 "\n\t" \
1126 /* L1 = e>>>25 */ \
1127 "rorx $25, %" #e ", " L1 "\n\t" \
1128 /* L1 = Sigma1(e) */ \
1129 "xorl " L2 ", " L1 "\n\t" \
1130 /* L4 = (f ^ g) & e */ \
1131 "andl %" #e ", " L4 "\n\t" \
1132 /* h += Sigma1(e) */ \
1133 "addl " L1 ", %" #h "\n\t" \
1134 /* L1 = a>>>2 */ \
1135 "rorx $2, %" #a ", " L1 "\n\t" \
1136 /* L2 = a>>>13 */ \
1137 "rorx $13, %" #a ", " L2 "\n\t" \
1138 /* L4 = Ch(e,f,g) */ \
1139 "xorl %" #g ", " L4 "\n\t" \
1140 /* L2 = (a>>>2) ^ (a>>>13) */ \
1141 "xorl " L1 ", " L2 "\n\t" \
1142 /* L1 = a>>>22 */ \
1143 "rorx $22, %" #a ", " L1 "\n\t" \
1144 /* h += Ch(e,f,g) */ \
1145 "addl " L4 ", %" #h "\n\t" \
1146 /* L1 = Sigma0(a) */ \
1147 "xorl " L2 ", " L1 "\n\t" \
1148 /* L4 = b */ \
1149 "movl %" #b ", " L4 "\n\t" \
1150 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1151 "addl %" #h ", %" #d "\n\t" \
1152 /* L4 = a ^ b */ \
1153 "xorl %" #a ", " L4 "\n\t" \
 1154 /* L3 = (a ^ b) & (b ^ c) */ \
1155 "andl " L4 ", " L3 "\n\t" \
1156 /* h += Sigma0(a) */ \
1157 "addl " L1 ", %" #h "\n\t" \
1158 /* L3 = Maj(a,b,c) */ \
1159 "xorl %" #b ", " L3 "\n\t" \
1160
1161
1162#define RND_RORX_X_0(a,b,c,d,e,f,g,h,i) \
1163 _RND_RORX_X_0(a,b,c,d,e,f,g,h,i)
1164#define RND_RORX_X_1(a,b,c,d,e,f,g,h,i) \
1165 _RND_RORX_X_1(a,b,c,d,e,f,g,h,i)
1166
1167#define RND_RORX_X4(a,b,c,d,e,f,g,h,i) \
1168 RND_RORX_X_0(a,b,c,d,e,f,g,h,i+0) \
1169 RND_RORX_X_1(h,a,b,c,d,e,f,g,i+1) \
1170 RND_RORX_X_0(g,h,a,b,c,d,e,f,i+2) \
1171 RND_RORX_X_1(f,g,h,a,b,c,d,e,i+3)
1172
1173#endif /* HAVE_INTEL_RORX */
1174
1175#define RND_STEP_0_1(a,b,c,d,e,f,g,h,i) \
1176 /* L1 = e>>>14 */ \
1177 "rorl $14, " L1 "\n\t" \
1178
1179#define RND_STEP_0_2(a,b,c,d,e,f,g,h,i) \
1180 /* L3 = b */ \
1181 "movl %" #b ", " L3 "\n\t" \
1182 /* L2 = f */ \
1183 "movl %" #f ", " L2 "\n\t" \
1184 /* h += w_k */ \
1185 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
1186 /* L2 = f ^ g */ \
1187 "xorl %" #g ", " L2 "\n\t" \
1188
1189#define RND_STEP_0_3(a,b,c,d,e,f,g,h,i) \
1190 /* L1 = (e>>>14) ^ e */ \
1191 "xorl %" #e ", " L1 "\n\t" \
1192 /* L2 = (f ^ g) & e */ \
1193 "andl %" #e ", " L2 "\n\t" \
1194
1195#define RND_STEP_0_4(a,b,c,d,e,f,g,h,i) \
1196 /* L1 = ((e>>>14) ^ e) >>> 5 */ \
1197 "rorl $5, " L1 "\n\t" \
1198 /* L2 = Ch(e,f,g) */ \
1199 "xorl %" #g ", " L2 "\n\t" \
1200 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \
1201 "xorl %" #e ", " L1 "\n\t" \
1202 /* h += Ch(e,f,g) */ \
1203 "addl " L2 ", %" #h "\n\t" \
1204
1205#define RND_STEP_0_5(a,b,c,d,e,f,g,h,i) \
1206 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \
1207 "rorl $6, " L1 "\n\t" \
1208 /* L3 = a ^ b (= b ^ c of next RND) */ \
1209 "xorl %" #a ", " L3 "\n\t" \
1210 /* h = h + w_k + Sigma1(e) */ \
1211 "addl " L1 ", %" #h "\n\t" \
1212 /* L2 = a */ \
1213 "movl %" #a ", " L2 "\n\t" \
1214
1215#define RND_STEP_0_6(a,b,c,d,e,f,g,h,i) \
 1216 /* L4 = (a ^ b) & (b ^ c) */ \
1217 "andl " L3 ", " L4 "\n\t" \
1218 /* L2 = a>>>9 */ \
1219 "rorl $9, " L2 "\n\t" \
1220 /* L2 = (a>>>9) ^ a */ \
1221 "xorl %" #a ", " L2 "\n\t" \
 1222 /* L4 = Maj(a,b,c) */ \
1223 "xorl %" #b ", " L4 "\n\t" \
1224
1225#define RND_STEP_0_7(a,b,c,d,e,f,g,h,i) \
1226 /* L2 = ((a>>>9) ^ a) >>> 11 */ \
1227 "rorl $11, " L2 "\n\t" \
1228 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1229 "addl %" #h ", %" #d "\n\t" \
1230 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \
1231 "xorl %" #a ", " L2 "\n\t" \
1232 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \
1233 "addl " L4 ", %" #h "\n\t" \
1234
1235#define RND_STEP_0_8(a,b,c,d,e,f,g,h,i) \
1236 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \
1237 "rorl $2, " L2 "\n\t" \
1238 /* L1 = d (e of next RND) */ \
1239 "movl %" #d ", " L1 "\n\t" \
1240 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
1241 "addl " L2 ", %" #h "\n\t" \
1242
1243#define RND_STEP_1_1(a,b,c,d,e,f,g,h,i) \
1244 /* L1 = e>>>14 */ \
1245 "rorl $14, " L1 "\n\t" \
1246
1247#define RND_STEP_1_2(a,b,c,d,e,f,g,h,i) \
 1248 /* L4 = b */ \
1249 "movl %" #b ", " L4 "\n\t" \
1250 /* L2 = f */ \
1251 "movl %" #f ", " L2 "\n\t" \
1252 /* h += w_k */ \
1253 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
1254 /* L2 = f ^ g */ \
1255 "xorl %" #g ", " L2 "\n\t" \
1256
1257#define RND_STEP_1_3(a,b,c,d,e,f,g,h,i) \
1258 /* L1 = (e>>>14) ^ e */ \
1259 "xorl %" #e ", " L1 "\n\t" \
1260 /* L2 = (f ^ g) & e */ \
1261 "andl %" #e ", " L2 "\n\t" \
1262
1263#define RND_STEP_1_4(a,b,c,d,e,f,g,h,i) \
1264 /* L1 = ((e>>>14) ^ e) >>> 5 */ \
1265 "rorl $5, " L1 "\n\t" \
1266 /* L2 = Ch(e,f,g) */ \
1267 "xorl %" #g ", " L2 "\n\t" \
1268 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \
1269 "xorl %" #e ", " L1 "\n\t" \
1270 /* h += Ch(e,f,g) */ \
1271 "addl " L2 ", %" #h "\n\t" \
1272
1273#define RND_STEP_1_5(a,b,c,d,e,f,g,h,i) \
1274 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \
1275 "rorl $6, " L1 "\n\t" \
1276 /* L4 = a ^ b (= b ^ c of next RND) */ \
1277 "xorl %" #a ", " L4 "\n\t" \
1278 /* h = h + w_k + Sigma1(e) */ \
1279 "addl " L1 ", %" #h "\n\t" \
1280 /* L2 = a */ \
1281 "movl %" #a ", " L2 "\n\t" \
1282
1283#define RND_STEP_1_6(a,b,c,d,e,f,g,h,i) \
1284 /* L3 = (a ^ b) & (b ^ c) */ \
1285 "andl " L4 ", " L3 "\n\t" \
1286 /* L2 = a>>>9 */ \
1287 "rorl $9, " L2 "\n\t" \
1288 /* L2 = (a>>>9) ^ a */ \
1289 "xorl %" #a ", " L2 "\n\t" \
 1290 /* L3 = Maj(a,b,c) */ \
1291 "xorl %" #b ", " L3 "\n\t" \
1292
1293#define RND_STEP_1_7(a,b,c,d,e,f,g,h,i) \
1294 /* L2 = ((a>>>9) ^ a) >>> 11 */ \
1295 "rorl $11, " L2 "\n\t" \
1296 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1297 "addl %" #h ", %" #d "\n\t" \
1298 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \
1299 "xorl %" #a ", " L2 "\n\t" \
1300 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \
1301 "addl " L3 ", %" #h "\n\t" \
1302
1303#define RND_STEP_1_8(a,b,c,d,e,f,g,h,i) \
1304 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \
1305 "rorl $2, " L2 "\n\t" \
1306 /* L1 = d (e of next RND) */ \
1307 "movl %" #d ", " L1 "\n\t" \
1308 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
1309 "addl " L2 ", %" #h "\n\t" \
1310
1311#define _RND_ALL_0(a,b,c,d,e,f,g,h,i) \
1312 /* h += w_k */ \
1313 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
1314 /* L2 = f */ \
1315 "movl %" #f ", " L2 "\n\t" \
1316 /* L3 = b */ \
1317 "movl %" #b ", " L3 "\n\t" \
1318 /* L2 = f ^ g */ \
1319 "xorl %" #g ", " L2 "\n\t" \
1320 /* L1 = e>>>14 */ \
1321 "rorl $14, " L1 "\n\t" \
1322 /* L2 = (f ^ g) & e */ \
1323 "andl %" #e ", " L2 "\n\t" \
1324 /* L1 = (e>>>14) ^ e */ \
1325 "xorl %" #e ", " L1 "\n\t" \
1326 /* L2 = Ch(e,f,g) */ \
1327 "xorl %" #g ", " L2 "\n\t" \
1328 /* L1 = ((e>>>14) ^ e) >>> 5 */ \
1329 "rorl $5, " L1 "\n\t" \
1330 /* h += Ch(e,f,g) */ \
1331 "addl " L2 ", %" #h "\n\t" \
1332 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \
1333 "xorl %" #e ", " L1 "\n\t" \
1334 /* L3 = a ^ b */ \
1335 "xorl %" #a ", " L3 "\n\t" \
1336 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \
1337 "rorl $6, " L1 "\n\t" \
1338 /* L2 = a */ \
1339 "movl %" #a ", " L2 "\n\t" \
1340 /* h = h + w_k + Sigma1(e) */ \
1341 "addl " L1 ", %" #h "\n\t" \
1342 /* L2 = a>>>9 */ \
1343 "rorl $9, " L2 "\n\t" \
 1344 /* L4 = (a ^ b) & (b ^ c) */ \
1345 "andl " L3 ", " L4 "\n\t" \
1346 /* L2 = (a>>>9) ^ a */ \
1347 "xorl %" #a ", " L2 "\n\t" \
 1348 /* L4 = Maj(a,b,c) */ \
1349 "xorl %" #b ", " L4 "\n\t" \
1350 /* L2 = ((a>>>9) ^ a) >>> 11 */ \
1351 "rorl $11, " L2 "\n\t" \
1352 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1353 "addl %" #h ", %" #d "\n\t" \
1354 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \
1355 "xorl %" #a ", " L2 "\n\t" \
1356 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \
1357 "addl " L4 ", %" #h "\n\t" \
1358 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \
1359 "rorl $2, " L2 "\n\t" \
1360 /* L1 = d (e of next RND) */ \
1361 "movl %" #d ", " L1 "\n\t" \
1362 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
1363 "addl " L2 ", %" #h "\n\t" \
1364
1365#define _RND_ALL_1(a,b,c,d,e,f,g,h,i) \
1366 /* h += w_k */ \
1367 "addl (" #i ")*4(" WK "), %" #h "\n\t" \
1368 /* L2 = f */ \
1369 "movl %" #f ", " L2 "\n\t" \
 1370 /* L4 = b */ \
1371 "movl %" #b ", " L4 "\n\t" \
1372 /* L2 = f ^ g */ \
1373 "xorl %" #g ", " L2 "\n\t" \
1374 /* L1 = e>>>14 */ \
1375 "rorl $14, " L1 "\n\t" \
1376 /* L2 = (f ^ g) & e */ \
1377 "andl %" #e ", " L2 "\n\t" \
1378 /* L1 = (e>>>14) ^ e */ \
1379 "xorl %" #e ", " L1 "\n\t" \
1380 /* L2 = Ch(e,f,g) */ \
1381 "xorl %" #g ", " L2 "\n\t" \
1382 /* L1 = ((e>>>14) ^ e) >>> 5 */ \
1383 "rorl $5, " L1 "\n\t" \
1384 /* h += Ch(e,f,g) */ \
1385 "addl " L2 ", %" #h "\n\t" \
1386 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \
1387 "xorl %" #e ", " L1 "\n\t" \
 1388 /* L4 = a ^ b */ \
1389 "xorl %" #a ", " L4 "\n\t" \
1390 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \
1391 "rorl $6, " L1 "\n\t" \
1392 /* L2 = a */ \
1393 "movl %" #a ", " L2 "\n\t" \
1394 /* h = h + w_k + Sigma1(e) */ \
1395 "addl " L1 ", %" #h "\n\t" \
1396 /* L2 = a>>>9 */ \
1397 "rorl $9, " L2 "\n\t" \
1398 /* L3 = (a ^ b) & (b ^ c) */ \
1399 "andl " L4 ", " L3 "\n\t" \
1400 /* L2 = (a>>>9) ^ a */ \
1401 "xorl %" #a", " L2 "\n\t" \
 1402 /* L3 = Maj(a,b,c) */ \
1403 "xorl %" #b ", " L3 "\n\t" \
1404 /* L2 = ((a>>>9) ^ a) >>> 11 */ \
1405 "rorl $11, " L2 "\n\t" \
1406 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
1407 "addl %"#h", %"#d"\n\t" \
1408 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \
1409 "xorl %" #a ", " L2 "\n\t" \
1410 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \
1411 "addl " L3 ", %" #h "\n\t" \
1412 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \
1413 "rorl $2, " L2 "\n\t" \
1414 /* L1 = d (e of next RND) */ \
1415 "movl %" #d ", " L1 "\n\t" \
1416 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
1417 "addl " L2 ", %" #h "\n\t" \
1418
1419
1420#define RND_ALL_0(a, b, c, d, e, f, g, h, i) \
1421 _RND_ALL_0(a, b, c, d, e, f, g, h, i)
1422#define RND_ALL_1(a, b, c, d, e, f, g, h, i) \
1423 _RND_ALL_1(a, b, c, d, e, f, g, h, i)
1424
1425#define RND_ALL_4(a, b, c, d, e, f, g, h, i) \
1426 RND_ALL_0(a, b, c, d, e, f, g, h, i+0) \
1427 RND_ALL_1(h, a, b, c, d, e, f, g, i+1) \
1428 RND_ALL_0(g, h, a, b, c, d, e, f, i+2) \
1429 RND_ALL_1(f, g, h, a, b, c, d, e, i+3)
1430
1431#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
1432
1433#if defined(HAVE_INTEL_AVX1) /* inline assembler for Intel AVX1 instructions */
1434
1435#define _VPALIGNR(op1, op2, op3, op4) \
1436 "vpalignr $"#op4", %"#op3", %"#op2", %"#op1"\n\t"
1437#define VPALIGNR(op1, op2, op3, op4) \
1438 _VPALIGNR(op1, op2, op3, op4)
1439#define _VPADDD(op1, op2, op3) \
1440 "vpaddd %"#op3", %"#op2", %"#op1"\n\t"
1441#define VPADDD(op1, op2, op3) \
1442 _VPADDD(op1, op2, op3)
1443#define _VPSRLD(op1, op2, op3) \
1444 "vpsrld $"#op3", %"#op2", %"#op1"\n\t"
1445#define VPSRLD(op1, op2, op3) \
1446 _VPSRLD(op1, op2, op3)
1447#define _VPSRLQ(op1, op2, op3) \
1448 "vpsrlq $"#op3", %"#op2", %"#op1"\n\t"
1449#define VPSRLQ(op1,op2,op3) \
1450 _VPSRLQ(op1,op2,op3)
1451#define _VPSLLD(op1,op2,op3) \
1452 "vpslld $"#op3", %"#op2", %"#op1"\n\t"
1453#define VPSLLD(op1,op2,op3) \
1454 _VPSLLD(op1,op2,op3)
1455#define _VPOR(op1,op2,op3) \
1456 "vpor %"#op3", %"#op2", %"#op1"\n\t"
1457#define VPOR(op1,op2,op3) \
1458 _VPOR(op1,op2,op3)
1459#define _VPXOR(op1,op2,op3) \
1460 "vpxor %"#op3", %"#op2", %"#op1"\n\t"
1461#define VPXOR(op1,op2,op3) \
1462 _VPXOR(op1,op2,op3)
1463#define _VPSHUFD(op1,op2,op3) \
1464 "vpshufd $"#op3", %"#op2", %"#op1"\n\t"
1465#define VPSHUFD(op1,op2,op3) \
1466 _VPSHUFD(op1,op2,op3)
1467#define _VPSHUFB(op1,op2,op3) \
1468 "vpshufb %"#op3", %"#op2", %"#op1"\n\t"
1469#define VPSHUFB(op1,op2,op3) \
1470 _VPSHUFB(op1,op2,op3)
1471#define _VPSLLDQ(op1,op2,op3) \
1472 "vpslldq $" #op3", %" #op2", %" #op1"\n\t"
1473#define VPSLLDQ(op1,op2,op3) \
1474 _VPSLLDQ(op1,op2,op3)
1475
1476#define MsgSched(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \
1477 RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \
1478 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\
1479 VPALIGNR (XTMP0, X3, X2, 4) /* XTMP0 = W[-7] */ \
1480 RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \
1481 RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \
1482 VPSRLD (XTMP2, XTMP1, 7) /* XTMP2 = W[-15] >> 7 */ \
 1483 VPSLLD (XTMP3, XTMP1, 25) /* XTMP3 = W[-15] << (32-7) */ \
1484 RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \
1485 RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \
 1486 VPSRLD (XTMP4, XTMP1, 18) /* XTMP4 = W[-15] >> 18 */ \
 1487 VPSLLD (XTMP5, XTMP1, 14) /* XTMP5 = W[-15] << (32-18) */ \
1488 RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \
1489 RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \
1490 VPOR (XTMP2, XTMP3, XTMP2) /* XTMP2 = W[-15] >>> 7 */ \
1491 VPOR (XTMP4, XTMP5, XTMP4) /* XTMP4 = W[-15] >>> 18 */ \
1492 RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \
1493 RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \
1494 RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \
 1495 VPSRLD (XTMP5, XTMP1, 3) /* XTMP5 = W[-15] >> 3 */ \
1496 VPXOR (XTMP2, XTMP4, XTMP2) \
1497 /* XTMP2 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \
1498 RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \
1499 RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \
1500 VPXOR (XTMP1, XTMP5, XTMP2) /* XTMP1 = s0 */ \
1501 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\
1502 RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \
1503 RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \
1504 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\
1505 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
1506 RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \
1507 RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \
1508 RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \
1509 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
1510 VPADDD (XTMP0, XTMP0, X0) \
1511 RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \
1512 RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \
1513 RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \
1514 VPXOR (XTMP2, XTMP3, XTMP2) \
1515 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \
1516 RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \
1517 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\
1518 RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \
1519 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\
1520 RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \
1521 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\
1522 RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \
1523 RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \
1524 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\
1525 RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \
1526 VPSRLQ (XTMP4, XTMP2, 17) /* XTMP4 = W[-2] MY_ROR 17 {xDxC} */ \
1527 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
1528 RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \
1529 RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \
1530 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */ \
1531 VPXOR (XTMP4, XTMP3, XTMP4) \
1532 RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \
1533 RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \
1534 VPXOR (XTMP5, XTMP4, XTMP5) /* XTMP5 = s1 {xDxC} */ \
1535 RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \
1536 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\
1537 RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \
1538 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */
1539
1540#if defined(HAVE_INTEL_RORX)
1541
1542#define MsgSched_RORX(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \
1543 RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \
1544 VPALIGNR (XTMP0, X3, X2, 4)\
1545 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\
1546 RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \
1547 RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \
1548 VPSRLD (XTMP2, XTMP1, 7)\
1549 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
1550 RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \
1551 RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \
1552 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */ \
 1553 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP3 = W[-15] MY_ROR 7 */\
1554 RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \
1555 RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \
1556 RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \
1557\
1558 RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \
1559 VPSRLD (XTMP2, XTMP1,18) \
1560 RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \
1561 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
1562 RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \
1563 VPXOR (XTMP3, XTMP3, XTMP1)\
1564 RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \
1565 VPXOR (XTMP3, XTMP3, XTMP2) \
 1566 /* XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \
1567 RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \
1568 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\
1569 RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \
1570 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */ \
1571 RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \
1572 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\
1573 RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \
1574 \
1575 RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \
1576 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
1577 RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \
1578 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
1579 VPADDD (XTMP0, XTMP0, X0) \
1580 RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \
1581 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \
1582 RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \
1583 VPXOR (XTMP2, XTMP2, XTMP3)\
1584 RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \
1585 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\
1586 RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \
1587 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\
1588 RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \
1589 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\
1590 RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \
1591\
1592 RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \
1593 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\
1594 RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \
1595 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\
1596 RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \
1597 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
1598 RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \
1599 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
1600 RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \
1601 VPXOR (XTMP2, XTMP2, XTMP3)\
1602 RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \
1603 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\
1604 RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \
1605 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\
1606 RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \
1607 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */
1608
1609#endif /* HAVE_INTEL_RORX */
1610
1611
1612#define _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \
1613 "# X0, X1, X2, X3 = W[0..15]\n\t" \
1614 "vmovdqu (%%rax), %" #X0 "\n\t" \
1615 "vmovdqu 16(%%rax), %" #X1 "\n\t" \
1616 VPSHUFB(X0, X0, BYTE_FLIP_MASK) \
1617 VPSHUFB(X1, X1, BYTE_FLIP_MASK) \
1618 "vmovdqu 32(%%rax), %" #X2 "\n\t" \
1619 "vmovdqu 48(%%rax), %" #X3 "\n\t" \
1620 VPSHUFB(X2, X2, BYTE_FLIP_MASK) \
1621 VPSHUFB(X3, X3, BYTE_FLIP_MASK)
1622
1623#define W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \
1624 _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
1625
1626
1627#define _SET_W_K_XFER_4(i) \
1628 "vpaddd (" #i "*4)+ 0+%[K], %%xmm0, %%xmm4\n\t" \
1629 "vpaddd (" #i "*4)+16+%[K], %%xmm1, %%xmm5\n\t" \
1630 "vmovdqu %%xmm4, (" WK ")\n\t" \
1631 "vmovdqu %%xmm5, 16(" WK ")\n\t" \
1632 "vpaddd (" #i "*4)+32+%[K], %%xmm2, %%xmm6\n\t" \
1633 "vpaddd (" #i "*4)+48+%[K], %%xmm3, %%xmm7\n\t" \
1634 "vmovdqu %%xmm6, 32(" WK ")\n\t" \
1635 "vmovdqu %%xmm7, 48(" WK ")\n\t"
1636
1637#define SET_W_K_XFER_4(i) \
1638 _SET_W_K_XFER_4(i)
1639
1640
1641static const ALIGN32 word64 mSHUF_00BA[] =
1642 { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
1643static const ALIGN32 word64 mSHUF_DC00[] =
1644 { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
1645static const ALIGN32 word64 mBYTE_FLIP_MASK[] =
1646 { 0x0405060700010203, 0x0c0d0e0f08090a0b };
1647
1648#define _Init_Masks(mask1, mask2, mask3) \
1649 "vmovdqa %[FLIP], %" #mask1 "\n\t" \
1650 "vmovdqa %[SHUF00BA], %" #mask2 "\n\t" \
1651 "vmovdqa %[SHUFDC00], %" #mask3 "\n\t"
1652
1653#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
1654 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1655
1656#define X0 %xmm0
1657#define X1 %xmm1
1658#define X2 %xmm2
1659#define X3 %xmm3
1660
1661#define XTMP0 %xmm4
1662#define XTMP1 %xmm5
1663#define XTMP2 %xmm6
1664#define XTMP3 %xmm7
1665#define XTMP4 %xmm8
1666#define XTMP5 %xmm9
1667#define XFER %xmm10
1668
1669#define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */
1670#define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */
1671#define BYTE_FLIP_MASK %xmm13
1672
1673
1674SHA256_NOINLINE static int Transform_Sha256_AVX1(wc_Sha256* sha256)
1675{
1676 __asm__ __volatile__ (
1677
1678 "subq $64, %%rsp\n\t"
1679
1680 "leaq 32(%[sha256]), %%rax\n\t"
1681 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1682 LOAD_DIGEST()
1683
1684 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
1685
1686 "movl %%r9d, " L4 "\n\t"
1687 "movl %%r12d, " L1 "\n\t"
1688 "xorl %%r10d, " L4 "\n\t"
1689
1690 SET_W_K_XFER_4(0)
1691 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1692 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1693 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1694 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1695
1696 SET_W_K_XFER_4(16)
1697 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1698 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1699 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1700 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1701
1702 SET_W_K_XFER_4(32)
1703 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1704 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1705 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1706 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1707
1708 SET_W_K_XFER_4(48)
1709 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1710 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1711 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1712 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1713
1714 STORE_ADD_DIGEST()
1715
1716 "addq $64, %%rsp\n\t"
1717
1718 :
1719 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1720 [SHUF00BA] "m" (mSHUF_00BA[0]),
1721 [SHUFDC00] "m" (mSHUF_DC00[0]),
1722 [sha256] "r" (sha256),
1723 [K] "m" (K)
1724 : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
1725 );
1726
1727 return 0;
1728}
1729
1730SHA256_NOINLINE static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256,
1731 word32 len)
1732{
1733 __asm__ __volatile__ (
1734
1735 "subq $64, %%rsp\n\t"
1736 "movq 120(%[sha256]), %%rax\n\t"
1737
1738 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1739 LOAD_DIGEST()
1740
1741 "# Start of loop processing a block\n"
1742 "1:\n\t"
1743
1744 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
1745
1746 "movl %%r9d, " L4 "\n\t"
1747 "movl %%r12d, " L1 "\n\t"
1748 "xorl %%r10d, " L4 "\n\t"
1749
1750 SET_W_K_XFER_4(0)
1751 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1752 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1753 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1754 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1755
1756 SET_W_K_XFER_4(16)
1757 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1758 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1759 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1760 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1761
1762 SET_W_K_XFER_4(32)
1763 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1764 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1765 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1766 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1767
1768 SET_W_K_XFER_4(48)
1769 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1770 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1771 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1772 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1773 "movq 120(%[sha256]), %%rax\n\t"
1774
1775 ADD_DIGEST()
1776
1777 "addq $64, %%rax\n\t"
1778 "subl $64, %[len]\n\t"
1779
1780 STORE_DIGEST()
1781
1782 "movq %%rax, 120(%[sha256])\n\t"
1783 "jnz 1b\n\t"
1784
1785 "addq $64, %%rsp\n\t"
1786
1787 :
1788 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1789 [SHUF00BA] "m" (mSHUF_00BA[0]),
1790 [SHUFDC00] "m" (mSHUF_DC00[0]),
1791 [sha256] "r" (sha256),
1792 [len] "r" (len),
1793 [K] "m" (K)
1794 : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
1795 );
1796
1797 return 0;
1798}
1799#endif /* HAVE_INTEL_AVX1 */
1800
1801#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
1802SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX(wc_Sha256* sha256)
1803{
1804 __asm__ __volatile__ (
1805
1806 "subq $64, %%rsp\n\t"
1807
1808 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1809 "leaq 32(%[sha256]), %%rax\n\t"
1810 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
1811
1812 LOAD_DIGEST()
1813
1814 SET_W_K_XFER_4(0)
1815 "movl %%r9d, " L4 "\n\t"
1816 "rorx $6, %%r12d, " L1 "\n\t"
1817 "xorl %%r10d, " L4 "\n\t"
1818 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1819 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1820 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1821 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1822
1823 SET_W_K_XFER_4(16)
1824 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1825 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1826 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1827 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1828
1829 SET_W_K_XFER_4(32)
1830 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1831 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1832 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1833 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1834
1835 SET_W_K_XFER_4(48)
1836 "xorl " L3 ", " L3 "\n\t"
1837 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1838 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1839 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1840 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1841 /* Prev RND: h += Maj(a,b,c) */
1842 "addl " L3 ", %%r8d\n\t"
1843
1844 STORE_ADD_DIGEST()
1845
1846 "addq $64, %%rsp\n\t"
1847
1848 :
1849 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1850 [SHUF00BA] "m" (mSHUF_00BA[0]),
1851 [SHUFDC00] "m" (mSHUF_DC00[0]),
1852 [sha256] "r" (sha256),
1853 [K] "m" (K)
1854 : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
1855 );
1856
1857 return 0;
1858}
1859
1860SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256,
1861 word32 len)
1862{
1863 __asm__ __volatile__ (
1864
1865 "subq $64, %%rsp\n\t"
1866 "movq 120(%[sha256]), %%rax\n\t"
1867
1868 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1869 LOAD_DIGEST()
1870
1871 "# Start of loop processing a block\n"
1872 "1:\n\t"
1873
1874 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
1875
1876 SET_W_K_XFER_4(0)
1877 "movl %%r9d, " L4 "\n\t"
1878 "rorx $6, %%r12d, " L1 "\n\t"
1879 "xorl %%r10d, " L4 "\n\t"
1880 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1881 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1882 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1883 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1884
1885 SET_W_K_XFER_4(16)
1886 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1887 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1888 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1889 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1890
1891 SET_W_K_XFER_4(32)
1892 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1893 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1894 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1895 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1896
1897 SET_W_K_XFER_4(48)
1898 "xorl " L3 ", " L3 "\n\t"
1899 "xorl " L2 ", " L2 "\n\t"
1900 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
1901 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4)
1902 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8)
1903 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
1904 /* Prev RND: h += Maj(a,b,c) */
1905 "addl " L3 ", %%r8d\n\t"
1906 "movq 120(%[sha256]), %%rax\n\t"
1907
1908 ADD_DIGEST()
1909
1910 "addq $64, %%rax\n\t"
1911 "subl $64, %[len]\n\t"
1912
1913 STORE_DIGEST()
1914
1915 "movq %%rax, 120(%[sha256])\n\t"
1916 "jnz 1b\n\t"
1917
1918 "addq $64, %%rsp\n\t"
1919
1920 :
1921 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1922 [SHUF00BA] "m" (mSHUF_00BA[0]),
1923 [SHUFDC00] "m" (mSHUF_DC00[0]),
1924 [sha256] "r" (sha256),
1925 [len] "r" (len),
1926 [K] "m" (K)
1927 : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
1928 );
1929
1930 return 0;
1931}
1932#endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */
1933
1934
1935#if defined(HAVE_INTEL_AVX2)
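/* AVX2 implementation: the _Len functions below process two 64-byte blocks
 * per loop iteration.  The low 128-bit lane of each ymm register holds a word
 * group of the first block and the high lane holds the matching group of the
 * second block, so one 256-bit operation advances the message schedule of
 * both blocks at once; the scalar compression rounds are still run per block. */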
1936#define Y0 %ymm0
1937#define Y1 %ymm1
1938#define Y2 %ymm2
1939#define Y3 %ymm3
1940
1941#define YTMP0 %ymm4
1942#define YTMP1 %ymm5
1943#define YTMP2 %ymm6
1944#define YTMP3 %ymm7
1945#define YTMP4 %ymm8
1946#define YTMP5 %ymm9
1947#define YXFER %ymm10
1948
1949#define SHUF_Y_00BA %ymm11 /* shuffle xBxA -> 00BA */
1950#define SHUF_Y_DC00 %ymm12 /* shuffle xDxC -> DC00 */
1951#define BYTE_FLIP_Y_MASK %ymm13
1952
1953#define YMM_REGS "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", \
1954 "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13"
1955
1956#define MsgSched_Y(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \
1957 RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \
1958 VPALIGNR (YTMP1, Y1, Y0, 4) /* YTMP1 = W[-15] */ \
1959 VPALIGNR (YTMP0, Y3, Y2, 4) /* YTMP0 = W[-7] */ \
1960 RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \
1961 RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \
1962 VPSRLD (YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \
1963 VPSLLD (YTMP3, YTMP1, 25) /* YTMP3 = W[-15] << (32-7) */ \
1964 RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \
1965 RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \
1966 VPSRLD (YTMP4, YTMP1, 18) /* YTMP4 = W[-15] >> 18 */ \
1967 VPSLLD (YTMP5, YTMP1, 14) /* YTMP5 = W[-15] << (32-18) */ \
1968 RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \
1969 RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \
1970 VPOR (YTMP2, YTMP3, YTMP2) /* YTMP2 = W[-15] >>> 7 */ \
1971 VPOR (YTMP4, YTMP5, YTMP4) /* YTMP4 = W[-15] >>> 18 */ \
1972 RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \
1973 RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \
1974 RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \
1975 VPSRLD (YTMP5, YTMP1, 3) /* YTMP5 = W[-15] >> 3 */ \
1976 VPXOR (YTMP2, YTMP4, YTMP2) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \
1977 RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \
1978 RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \
1979 VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \
1980 VPSHUFD (YTMP2, Y3, 0b11111010) /* YTMP2 = W[-2] {BBAA} */ \
1981 RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \
1982 RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \
1983 VPSRLD (YTMP4, YTMP2, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \
1984 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */ \
1985 RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \
1986 RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \
1987 RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \
1988 VPSRLQ (YTMP2, YTMP2, 17) /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */ \
1989 VPADDD (YTMP0, YTMP0, Y0) \
1990 RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \
1991 RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \
1992 RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \
1993 VPXOR (YTMP2, YTMP3, YTMP2) \
1994 VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \
1995 RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \
1996 VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \
1997 RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \
1998 VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \
1999 RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \
2000 VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \
2001 RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \
2002 RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \
2003 VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \
2004 RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \
2005 VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \
2006 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \
2007 RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \
2008 RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \
2009 VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \
2010 VPXOR (YTMP4, YTMP3, YTMP4) \
2011 RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \
2012 RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \
2013 VPXOR (YTMP5, YTMP4, YTMP5) /* YTMP5 = s1 {xDxC} */ \
2014 RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \
2015 VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \
2016 RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \
2017 VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */
2018
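/* For reference, a scalar sketch of what one MsgSched_Y step computes per
 * message word (illustrative only: W, s0, s1 and rotr are local names for
 * this comment, with rotr meaning a 32-bit rotate right):
 *
 *     s0   = rotr(W[i-15], 7) ^ rotr(W[i-15], 18) ^ (W[i-15] >> 3);
 *     s1   = rotr(W[i-2], 17) ^ rotr(W[i-2], 19) ^ (W[i-2] >> 10);
 *     W[i] = W[i-16] + s0 + W[i-7] + s1;
 *
 * The macro produces four consecutive W values in each 128-bit lane (i.e. for
 * both interleaved blocks) while the RND_STEP_* pieces it is woven around
 * execute four scalar compression rounds. */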
2019#if defined(HAVE_INTEL_RORX)
2020
2021#define MsgSched_Y_RORX(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \
2022 RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \
2023 VPALIGNR (YTMP1, Y1, Y0, 4) /* YTMP1 = W[-15] */ \
2024 RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \
2025 VPALIGNR (YTMP0, Y3, Y2, 4) /* YTMP0 = W[-7] */ \
2026 RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \
2027 VPSRLD (YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \
2028 RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \
2029 VPSLLD (YTMP3, YTMP1, 25) /* YTMP3 = W[-15] << (32-7) */ \
2030 RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \
2031 VPSRLD (YTMP4, YTMP1, 18) /* YTMP4 = W[-15] >> 18 */ \
2032 RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \
2033 VPSLLD (YTMP5, YTMP1, 14) /* YTMP5 = W[-15] << (32-18) */ \
2034 RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \
2035 VPOR (YTMP2, YTMP2, YTMP3) /* YTMP2 = W[-15] >>> 7 */ \
2036 RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \
2037 VPOR (YTMP4, YTMP4, YTMP5) /* YTMP4 = W[-15] >>> 18 */ \
2038 RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \
2039 VPSRLD (YTMP5, YTMP1, 3) /* YTMP5 = W[-15] >> 3 */ \
2040 RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \
2041 VPXOR (YTMP2, YTMP2, YTMP4) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \
2042 RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \
2043 VPSHUFD (YTMP3, Y3, 0b11111010) /* YTMP3 = W[-2] {BBAA} */ \
2044 RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \
2045 VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \
2046 RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \
2047 VPSRLD (YTMP4, YTMP3, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \
2048 RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \
2049 VPSRLQ (YTMP2, YTMP3, 19) /* YTMP2 = W[-2] MY_ROR 19 {xBxA} */ \
2050 RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \
2051 VPSRLQ (YTMP3, YTMP3, 17) /* YTMP3 = W[-2] MY_ROR 17 {xBxA} */ \
2052 RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \
2053 VPADDD (YTMP0, YTMP0, Y0) \
2054 RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \
2055 VPXOR (YTMP2, YTMP2, YTMP3) \
2056 RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \
2057 VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \
2058 RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \
2059 VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \
2060 RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \
2061 VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \
2062 RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \
2063 VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \
2064 RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \
2065 VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \
2066 RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \
2067 RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \
2068 VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \
2069 RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \
2070 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \
2071 RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \
2072 VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \
2073 RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \
2074 VPXOR (YTMP4, YTMP4, YTMP3) \
2075 RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \
2076 VPXOR (YTMP5, YTMP5, YTMP4) /* YTMP5 = s1 {xDxC} */ \
2077 RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \
2078 RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \
2079 VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \
2080 RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \
2081 RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \
2082 VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */ \
2083
2084#endif /* HAVE_INTEL_RORX */
2085
2086#define _VINSERTI128(op1,op2,op3,op4) \
2087 "vinserti128 $" #op4 ", %" #op3 ", %" #op2 ", %" #op1 "\n\t"
2088#define VINSERTI128(op1,op2,op3,op4) \
2089 _VINSERTI128(op1,op2,op3,op4)
2090
2091
2092#define _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \
2093 "# X0, X1, X2, X3 = W[0..15]\n\t" \
2094 "vmovdqu (%%" #reg "), %%xmm0\n\t" \
2095 "vmovdqu 16(%%" #reg "), %%xmm1\n\t" \
2096 VPSHUFB(X0, X0, BYTE_FLIP_MASK) \
2097 VPSHUFB(X1, X1, BYTE_FLIP_MASK) \
2098 "vmovdqu 32(%%" #reg "), %%xmm2\n\t" \
2099 "vmovdqu 48(%%" #reg "), %%xmm3\n\t" \
2100 VPSHUFB(X2, X2, BYTE_FLIP_MASK) \
2101 VPSHUFB(X3, X3, BYTE_FLIP_MASK)
2102
2103#define LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \
2104 _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg)
2105
2106
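/* LOAD_W_K pulls in two consecutive 64-byte blocks: the first block's four
 * 16-byte chunks land in the low lanes of Y0..Y3, the second block's chunks
 * are merged into the high lanes with vinserti128, and vpshufb with the byte
 * flip mask converts every 32-bit word from big-endian message order to host
 * byte order. */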
2107#define _LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \
2108 "# X0, X1, X2, X3 = W[0..15]\n\t" \
2109 "vmovdqu (%%" #reg "), %%xmm0\n\t" \
2110 "vmovdqu 16(%%" #reg "), %%xmm1\n\t" \
2111 "vmovdqu 64(%%" #reg "), %%xmm4\n\t" \
2112 "vmovdqu 80(%%" #reg "), %%xmm5\n\t" \
2113 VINSERTI128(Y0, Y0, XTMP0, 1) \
2114 VINSERTI128(Y1, Y1, XTMP1, 1) \
2115 VPSHUFB(Y0, Y0, BYTE_FLIP_Y_MASK) \
2116 VPSHUFB(Y1, Y1, BYTE_FLIP_Y_MASK) \
2117 "vmovdqu 32(%%" #reg "), %%xmm2\n\t" \
2118 "vmovdqu 48(%%" #reg "), %%xmm3\n\t" \
2119 "vmovdqu 96(%%" #reg "), %%xmm6\n\t" \
2120 "vmovdqu 112(%%" #reg "), %%xmm7\n\t" \
2121 VINSERTI128(Y2, Y2, XTMP2, 1) \
2122 VINSERTI128(Y3, Y3, XTMP3, 1) \
2123 VPSHUFB(Y2, Y2, BYTE_FLIP_Y_MASK) \
2124 VPSHUFB(Y3, Y3, BYTE_FLIP_Y_MASK)
2125
2126#define LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \
2127 _LOAD_W_K(BYTE_FLIP_Y_MASK, reg)
2128
2129
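/* SET_W_Y_4 adds the (duplicated) round constants from K256 to four ymm
 * schedule registers and spills the resulting W+K sums into the 512-byte
 * scratch area reserved on the stack, so the scalar rounds can later consume
 * them with plain memory operands. */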
2130#define _SET_W_Y_4(i) \
2131 "vpaddd (" #i "*8)+ 0+%[K], %%ymm0, %%ymm4\n\t" \
2132 "vpaddd (" #i "*8)+32+%[K], %%ymm1, %%ymm5\n\t" \
2133 "vmovdqu %%ymm4, (" #i "*8)+ 0(" WK ")\n\t" \
2134 "vmovdqu %%ymm5, (" #i "*8)+32(" WK ")\n\t" \
2135 "vpaddd (" #i "*8)+64+%[K], %%ymm2, %%ymm4\n\t" \
2136 "vpaddd (" #i "*8)+96+%[K], %%ymm3, %%ymm5\n\t" \
2137 "vmovdqu %%ymm4, (" #i "*8)+64(" WK ")\n\t" \
2138 "vmovdqu %%ymm5, (" #i "*8)+96(" WK ")\n\t"
2139
2140#define SET_W_Y_4(i) \
2141 _SET_W_Y_4(i)
2142
2143
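/* Constant masks for the AVX2 path: mBYTE_FLIP_Y_MASK byte-swaps each 32-bit
 * word (big-endian input to host order), while mSHUF_Y_00BA / mSHUF_Y_DC00
 * compact the s1 results, which land only in the even 32-bit positions, into
 * the low ("00BA") or high ("DC00") half of a lane; the 0xFF bytes zero the
 * unused positions of the vpshufb result. */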
2144static const ALIGN32 word64 mSHUF_Y_00BA[] =
2145 { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF,
2146 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
2147static const ALIGN32 word64 mSHUF_Y_DC00[] =
2148 { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100,
2149 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
2150static const ALIGN32 word64 mBYTE_FLIP_Y_MASK[] =
2151 { 0x0405060700010203, 0x0c0d0e0f08090a0b,
2152 0x0405060700010203, 0x0c0d0e0f08090a0b };
2153
2154#define _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \
2155 "vmovdqa %[FLIP], %" #BYTE_FLIP_MASK "\n\t" \
2156 "vmovdqa %[SHUF00BA], %" #SHUF_00BA "\n\t" \
2157 "vmovdqa %[SHUFDC00], %" #SHUF_DC00 "\n\t"
2158
2159#define INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \
2160 _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
2161
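/* K256 holds the 64 SHA-256 round constants with each group of four stored
 * twice, so a single 256-bit vpaddd in SET_W_Y_4 applies the same constants
 * to the low-lane (first block) and high-lane (second block) schedule words. */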
2162static const ALIGN32 word32 K256[128] = {
2163 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L,
2164 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L,
2165 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L,
2166 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L,
2167 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L,
2168 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L,
2169 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L,
2170 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L,
2171 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
2172 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
2173 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL,
2174 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL,
2175 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L,
2176 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L,
2177 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L,
2178 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L,
2179 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L,
2180 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L,
2181 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
2182 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
2183 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L,
2184 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L,
2185 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L,
2186 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L,
2187 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L,
2188 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L,
2189 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L,
2190 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L,
2191 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
2192 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
2193 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L,
2194 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
2195};
2196
2197SHA256_NOINLINE static int Transform_Sha256_AVX2(wc_Sha256* sha256)
2198{
2199 __asm__ __volatile__ (
2200
2201 "subq $512, %%rsp\n\t"
2202 "leaq 32(%[sha256]), %%rax\n\t"
2203
2204 INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
2205 LOAD_DIGEST()
2206
2207 LOAD_W_K_LOW(BYTE_FLIP_MASK, rax)
2208
2209 "movl %%r9d, " L4 "\n\t"
2210 "movl %%r12d, " L1 "\n\t"
2211 "xorl %%r10d, " L4 "\n\t"
2212
2213 SET_W_Y_4(0)
2214 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
2215 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8)
2216 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
2217 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
2218
2219 SET_W_Y_4(16)
2220 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
2221 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
2222 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
2223 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
2224
2225 SET_W_Y_4(32)
2226 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
2227 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
2228 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
2229 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
2230
2231 SET_W_Y_4(48)
2232 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96)
2233 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
2234 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
2235 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
2236
2237 STORE_ADD_DIGEST()
2238
2239 "addq $512, %%rsp\n\t"
2240
2241 :
2242 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
2243 [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
2244 [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
2245 [sha256] "r" (sha256),
2246 [K] "m" (K256)
2247 : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
2248 );
2249
2250 return 0;
2251}
2252
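/* The interleaved loop below consumes 128 bytes per iteration, so when the
 * input contains an odd number of 64-byte blocks the leading block is first
 * copied into sha256->buffer and run through the single-block AVX2 transform,
 * and the data pointer and length are advanced before entering the loop. */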
2253SHA256_NOINLINE static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256,
2254 word32 len)
2255{
2256 if ((len & WC_SHA256_BLOCK_SIZE) != 0) {
2257 XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE);
2258 Transform_Sha256_AVX2(sha256);
2259 sha256->data += WC_SHA256_BLOCK_SIZE;
2260 len -= WC_SHA256_BLOCK_SIZE;
2261 if (len == 0)
2262 return 0;
2263    }
2264
2265 __asm__ __volatile__ (
2266
2267 "subq $512, %%rsp\n\t"
2268 "movq 120(%[sha256]), %%rax\n\t"
2269
2270 INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
2271 LOAD_DIGEST()
2272
2273 "# Start of loop processing two blocks\n"
2274 "1:\n\t"
2275
2276 LOAD_W_K(BYTE_FLIP_Y_MASK, rax)
2277
2278 "movl %%r9d, " L4 "\n\t"
2279 "movl %%r12d, " L1 "\n\t"
2280 "xorl %%r10d, " L4 "\n\t"
2281
2282 SET_W_Y_4(0)
2283 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
2284 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8)
2285 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
2286 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
2287
2288 SET_W_Y_4(16)
2289 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
2290 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
2291 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
2292 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
2293
2294 SET_W_Y_4(32)
2295 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
2296 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
2297 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
2298 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
2299
2300 SET_W_Y_4(48)
2301 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96)
2302 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
2303 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
2304 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
2305
2306 ADD_DIGEST()
2307 STORE_DIGEST()
2308
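        /* Rounds for the second block: its W+K values were spilled to the
         * high 128-bit lane of each 32-byte slot above, i.e. the odd 16-byte
         * slots addressed by the 4, 12, 20, ... offsets below, so only the 64
         * compression rounds remain here and no further scheduling is done. */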
2309 "movl %%r9d, " L4 "\n\t"
2310 "movl %%r12d, " L1 "\n\t"
2311 "xorl %%r10d, " L4 "\n\t"
2312
2313 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4)
2314 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
2315 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20)
2316 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28)
2317 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36)
2318 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44)
2319 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52)
2320 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60)
2321 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68)
2322 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76)
2323 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84)
2324 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92)
2325 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100)
2326 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108)
2327 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116)
2328 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124)
2329
2330 ADD_DIGEST()
2331
2332 "movq 120(%[sha256]), %%rax\n\t"
2333 "addq $128, %%rax\n\t"
2334 "subl $128, %[len]\n\t"
2335
2336 STORE_DIGEST()
2337
2338 "movq %%rax, 120(%[sha256])\n\t"
2339 "jnz 1b\n\t"
2340
2341 "addq $512, %%rsp\n\t"
2342
2343 :
2344 : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]),
2345 [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
2346 [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
2347 [sha256] "r" (sha256),
2348 [len] "r" (len),
2349 [K] "m" (K256)
2350 : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
2351 );
2352
2353 return 0;
2354 }
2355
2356#if defined(HAVE_INTEL_RORX)
2357SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX(wc_Sha256* sha256)
2358{
2359 __asm__ __volatile__ (
2360
2361 "subq $512, %%rsp\n\t"
2362 "leaq 32(%[sha256]), %%rax\n\t"
2363
2364 INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
2365 LOAD_W_K_LOW(BYTE_FLIP_MASK, rax)
2366
2367 LOAD_DIGEST()
2368
2369 "movl %%r9d, " L4 "\n\t"
2370 "rorx $6, %%r12d, " L1 "\n\t"
2371 "xorl %%r10d, " L4 "\n\t"
2372
2373 SET_W_Y_4(0)
2374 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
2375 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8)
2376 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
2377 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
2378
2379 SET_W_Y_4(16)
2380 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
2381 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
2382 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
2383 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
2384
2385 SET_W_Y_4(32)
2386 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
2387 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
2388 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
2389 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
2390
2391 SET_W_Y_4(48)
2392 "xorl " L3 ", " L3 "\n\t"
2393 "xorl " L2 ", " L2 "\n\t"
2394 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96)
2395 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
2396 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
2397 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
2398 /* Prev RND: h += Maj(a,b,c) */
2399 "addl " L3 ", %%r8d\n\t"
2400
2401 STORE_ADD_DIGEST()
2402
2403 "addq $512, %%rsp\n\t"
2404
2405 :
2406 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
2407 [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
2408 [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
2409 [sha256] "r" (sha256),
2410 [K] "m" (K256)
2411 : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
2412 );
2413
2414 return 0;
2415 }
2416
2417SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256,
2418 word32 len)
2419 {
2420 if ((len & WC_SHA256_BLOCK_SIZE) != 0) {
2421 XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE);
2422 Transform_Sha256_AVX2_RORX(sha256);
2423 sha256->data += WC_SHA256_BLOCK_SIZE;
2424 len -= WC_SHA256_BLOCK_SIZE;
2425 if (len == 0)
2426 return 0;
2427 }
2428
2429 __asm__ __volatile__ (
2430
2431 "subq $512, %%rsp\n\t"
2432 "movq 120(%[sha256]), %%rax\n\t"
2433
2434 INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
2435 LOAD_DIGEST()
2436
2437 "# Start of loop processing two blocks\n"
2438 "1:\n\t"
2439
2440 LOAD_W_K(BYTE_FLIP_Y_MASK, rax)
2441
2442 "movl %%r9d, " L4 "\n\t"
2443 "rorx $6, %%r12d, " L1 "\n\t"
2444 "xorl %%r10d, " L4 "\n\t"
2445
2446 SET_W_Y_4(0)
2447 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0)
2448 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8)
2449 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
2450 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
2451
2452 SET_W_Y_4(16)
2453 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
2454 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
2455 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
2456 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
2457
2458 SET_W_Y_4(32)
2459 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
2460 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
2461 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
2462 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
2463
2464 SET_W_Y_4(48)
2465 "xorl " L3 ", " L3 "\n\t"
2466 "xorl " L2 ", " L2 "\n\t"
2467 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96)
2468 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
2469 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
2470 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
2471 /* Prev RND: h += Maj(a,b,c) */
2472 "addl " L3 ", %%r8d\n\t"
2473 "xorl " L2 ", " L2 "\n\t"
2474
2475 ADD_DIGEST()
2476 STORE_DIGEST()
2477
2478 "movl %%r9d, " L4 "\n\t"
2479 "xorl " L3 ", " L3 "\n\t"
2480 "xorl %%r10d, " L4 "\n\t"
2481
2482 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4)
2483 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
2484 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20)
2485 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28)
2486 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36)
2487 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44)
2488 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52)
2489 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60)
2490 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68)
2491 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76)
2492 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84)
2493 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92)
2494 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100)
2495 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108)
2496 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116)
2497 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124)
2498 /* Prev RND: h += Maj(a,b,c) */
2499 "addl " L3 ", %%r8d\n\t"
2500 "movq 120(%[sha256]), %%rax\n\t"
2501
2502 ADD_DIGEST()
2503
2504 "addq $128, %%rax\n\t"
2505 "subl $128, %[len]\n\t"
2506
2507 STORE_DIGEST()
2508
2509 "movq %%rax, 120(%[sha256])\n\t"
2510 "jnz 1b\n\t"
2511
2512 "addq $512, %%rsp\n\t"
2513
2514 :
2515 : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]),
2516 [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
2517 [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
2518 [sha256] "r" (sha256),
2519 [len] "r" (len),
2520 [K] "m" (K256)
2521 : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
2522 );
2523
2524 return 0;
2525 }
2526#endif /* HAVE_INTEL_RORX */
2527#endif /* HAVE_INTEL_AVX2 */
2528
2529
2530#ifdef WOLFSSL_SHA224
2531
2532#ifdef STM32_HASH_SHA2
2533
2534 /* Supports CubeMX HAL or Standard Peripheral Library */
2535
2536 int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
2537 {
2538 if (sha224 == NULL)
2539 return BAD_FUNC_ARG;
2540
2541 (void)devId;
2542 (void)heap;
2543
2544 wc_Stm32_Hash_Init(&sha224->stmCtx);
2545 return 0;
2546 }
2547
2548 int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
2549 {
2550 int ret = 0;
2551
2552 if (sha224 == NULL || (data == NULL && len > 0)) {
2553 return BAD_FUNC_ARG;
2554 }
2555
2556 ret = wolfSSL_CryptHwMutexLock();
2557 if (ret == 0) {
2558 ret = wc_Stm32_Hash_Update(&sha224->stmCtx,
2559 HASH_AlgoSelection_SHA224, data, len);
2560 wolfSSL_CryptHwMutexUnLock();
2561 }
2562 return ret;
2563 }
2564
2565 int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
2566 {
2567 int ret = 0;
2568
2569 if (sha224 == NULL || hash == NULL) {
2570 return BAD_FUNC_ARG;
2571 }
2572
2573 ret = wolfSSL_CryptHwMutexLock();
2574 if (ret == 0) {
2575 ret = wc_Stm32_Hash_Final(&sha224->stmCtx,
2576 HASH_AlgoSelection_SHA224, hash, WC_SHA224_DIGEST_SIZE);
2577 wolfSSL_CryptHwMutexUnLock();
2578 }
2579
2580 (void)wc_InitSha224(sha224); /* reset state */
2581
2582 return ret;
2583 }
2584
2585#elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
2586 /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */
2587
2588#elif defined(WOLFSSL_AFALG_HASH)
2589 #error SHA224 currently not supported with AF_ALG enabled
2590
2591#elif defined(WOLFSSL_DEVCRYPTO_HASH)
2592 /* implemented in wolfcrypt/src/port/devcrypto/devcrypt_hash.c */
2593
2594#else
2595
2596 #define NEED_SOFT_SHA224
2597
2598
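    /* Software SHA-224: same compression function as SHA-256; only the
     * initial hash values below (the SHA-224 IVs from FIPS 180-4) and the
     * 28-byte truncated output produced in wc_Sha224Final differ. */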
2599 static int InitSha224(wc_Sha224* sha224)
2600 {
2601 int ret = 0;
2602
2603 if (sha224 == NULL) {
2604 return BAD_FUNC_ARG;
2605 }
2606
2607 sha224->digest[0] = 0xc1059ed8;
2608 sha224->digest[1] = 0x367cd507;
2609 sha224->digest[2] = 0x3070dd17;
2610 sha224->digest[3] = 0xf70e5939;
2611 sha224->digest[4] = 0xffc00b31;
2612 sha224->digest[5] = 0x68581511;
2613 sha224->digest[6] = 0x64f98fa7;
2614 sha224->digest[7] = 0xbefa4fa4;
2615
2616 sha224->buffLen = 0;
2617 sha224->loLen = 0;
2618 sha224->hiLen = 0;
2619
2620 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
2621 /* choose best Transform function under this runtime environment */
2622 Sha256_SetTransform();
2623 #endif
2624
2625 return ret;
2626 }
2627
2628#endif
2629
2630#ifdef NEED_SOFT_SHA224
2631 int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
2632 {
2633 int ret = 0;
2634
2635 if (sha224 == NULL)
2636 return BAD_FUNC_ARG;
2637
2638 sha224->heap = heap;
2639
2640 ret = InitSha224(sha224);
2641 if (ret != 0)
2642 return ret;
2643
2644 #ifdef WOLFSSL_SMALL_STACK_CACHE
2645 sha224->W = NULL;
2646 #endif
2647
2648 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2649 ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
2650 WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
2651 #else
2652 (void)devId;
2653 #endif /* WOLFSSL_ASYNC_CRYPT */
2654
2655 return ret;
2656 }
2657
2658 int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
2659 {
2660 int ret;
2661
2662 if (sha224 == NULL || (data == NULL && len > 0)) {
2663 return BAD_FUNC_ARG;
2664 }
2665
2666 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2667 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
2668 #if defined(HAVE_INTEL_QA)
2669 return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
2670 #endif
2671 }
2672 #endif /* WOLFSSL_ASYNC_CRYPT */
2673
2674 ret = Sha256Update((wc_Sha256*)sha224, data, len);
2675
2676 return ret;
2677 }
2678
2679 int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
2680 {
2681 int ret;
2682
2683 if (sha224 == NULL || hash == NULL) {
2684 return BAD_FUNC_ARG;
2685 }
2686
2687 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2688 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
2689 #if defined(HAVE_INTEL_QA)
2690 return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
2691 WC_SHA224_DIGEST_SIZE);
2692 #endif
2693 }
2694 #endif /* WOLFSSL_ASYNC_CRYPT */
2695
2696 ret = Sha256Final((wc_Sha256*)sha224);
2697 if (ret != 0)
2698 return ret;
2699
2700 #if defined(LITTLE_ENDIAN_ORDER)
2701 ByteReverseWords(sha224->digest, sha224->digest, WC_SHA224_DIGEST_SIZE);
2702 #endif
2703 XMEMCPY(hash, sha224->digest, WC_SHA224_DIGEST_SIZE);
2704
2705 return InitSha224(sha224); /* reset state */
2706 }
2707#endif /* end of SHA224 software implementation */
2708
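    /* Typical one-shot use of the SHA-224 API in this file (illustrative
     * sketch only; data/dataLen are caller-supplied and error handling is
     * trimmed):
     *
     *     wc_Sha224 sha;
     *     byte digest[WC_SHA224_DIGEST_SIZE];
     *     if (wc_InitSha224(&sha) == 0) {
     *         wc_Sha224Update(&sha, data, dataLen);
     *         wc_Sha224Final(&sha, digest);
     *         wc_Sha224Free(&sha);
     *     }
     */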
2709 int wc_InitSha224(wc_Sha224* sha224)
2710 {
2711 return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
2712 }
2713
2714 void wc_Sha224Free(wc_Sha224* sha224)
2715 {
2716 if (sha224 == NULL)
2717 return;
2718
2719#ifdef WOLFSSL_SMALL_STACK_CACHE
2720 if (sha224->W != NULL) {
2721 XFREE(sha224->W, NULL, DYNAMIC_TYPE_DIGEST);
2722 sha224->W = NULL;
2723 }
2724#endif
2725
2726 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2727 wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
2728 #endif /* WOLFSSL_ASYNC_CRYPT */
2729
2730 #ifdef WOLFSSL_PIC32MZ_HASH
2731 wc_Sha256Pic32Free(sha224);
2732 #endif
2733 }
2734#endif /* WOLFSSL_SHA224 */
2735
2736
2737int wc_InitSha256(wc_Sha256* sha256)
2738{
2739 return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
2740}
2741
2742void wc_Sha256Free(wc_Sha256* sha256)
2743{
2744 if (sha256 == NULL)
2745 return;
2746
2747#ifdef WOLFSSL_SMALL_STACK_CACHE
2748 if (sha256->W != NULL) {
2749 XFREE(sha256->W, NULL, DYNAMIC_TYPE_DIGEST);
2750 sha256->W = NULL;
2751 }
2752#endif
2753
2754#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
2755 wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
2756#endif /* WOLFSSL_ASYNC_CRYPT */
2757#ifdef WOLFSSL_PIC32MZ_HASH
2758 wc_Sha256Pic32Free(sha256);
2759#endif
2760#if defined(WOLFSSL_AFALG_HASH)
2761 if (sha256->alFd > 0) {
2762 close(sha256->alFd);
2763 sha256->alFd = -1; /* avoid possible double close on socket */
2764 }
2765 if (sha256->rdFd > 0) {
2766 close(sha256->rdFd);
2767 sha256->rdFd = -1; /* avoid possible double close on socket */
2768 }
2769#endif /* WOLFSSL_AFALG_HASH */
2770#ifdef WOLFSSL_DEVCRYPTO_HASH
2771 wc_DevCryptoFree(&sha256->ctx);
2772#endif /* WOLFSSL_DEVCRYPTO_HASH */
2773#if defined(WOLFSSL_AFALG_HASH_KEEP) || \
2774 (defined(WOLFSSL_DEVCRYPTO_HASH) && defined(WOLFSSL_DEVCRYPTO_HASH_KEEP))
2775 if (sha256->msg != NULL) {
2776 XFREE(sha256->msg, sha256->heap, DYNAMIC_TYPE_TMP_BUFFER);
2777 sha256->msg = NULL;
2778 }
2779#endif
2780}
2781
2782#endif /* !WOLFSSL_TI_HASH */
2783#endif /* HAVE_FIPS */
2784
2785
2786#ifndef WOLFSSL_TI_HASH
2787#ifdef WOLFSSL_SHA224
2788 int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash)
2789 {
2790 int ret;
2791 wc_Sha224 tmpSha224;
2792
2793 if (sha224 == NULL || hash == NULL)
2794 return BAD_FUNC_ARG;
2795
2796 ret = wc_Sha224Copy(sha224, &tmpSha224);
2797 if (ret == 0) {
2798 ret = wc_Sha224Final(&tmpSha224, hash);
2799 wc_Sha224Free(&tmpSha224);
2800 }
2801 return ret;
2802 }
2803 int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst)
2804 {
2805 int ret = 0;
2806
2807 if (src == NULL || dst == NULL)
2808 return BAD_FUNC_ARG;
2809
2810 XMEMCPY(dst, src, sizeof(wc_Sha224));
2811 #ifdef WOLFSSL_SMALL_STACK_CACHE
2812 dst->W = NULL;
2813 #endif
2814
2815 #ifdef WOLFSSL_ASYNC_CRYPT
2816 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2817 #endif
2818
2819 return ret;
2820 }
2821#endif /* WOLFSSL_SHA224 */
2822
2823#ifdef WOLFSSL_AFALG_HASH
2824 /* implemented in wolfcrypt/src/port/af_alg/afalg_hash.c */
2825
2826#elif defined(WOLFSSL_DEVCRYPTO_HASH)
2827 /* implemented in wolfcrypt/src/port/devcrypto/devcrypt_hash.c */
2828
2829#else
2830
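/* wc_Sha256GetHash returns an intermediate digest without disturbing the
 * running hash: the context is duplicated with wc_Sha256Copy, the copy is
 * finalized and freed, and the original stays valid for further
 * wc_Sha256Update calls. */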
2831int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash)
2832{
2833 int ret;
2834 wc_Sha256 tmpSha256;
2835
2836 if (sha256 == NULL || hash == NULL)
2837 return BAD_FUNC_ARG;
2838
2839 ret = wc_Sha256Copy(sha256, &tmpSha256);
2840 if (ret == 0) {
2841 ret = wc_Sha256Final(&tmpSha256, hash);
2842 wc_Sha256Free(&tmpSha256);
2843 }
2844 return ret;
2845}
2846int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
2847{
2848 int ret = 0;
2849
2850 if (src == NULL || dst == NULL)
2851 return BAD_FUNC_ARG;
2852
2853 XMEMCPY(dst, src, sizeof(wc_Sha256));
2854#ifdef WOLFSSL_SMALL_STACK_CACHE
2855 dst->W = NULL;
2856#endif
2857
2858#ifdef WOLFSSL_ASYNC_CRYPT
2859 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2860#endif
2861#ifdef WOLFSSL_PIC32MZ_HASH
2862 ret = wc_Pic32HashCopy(&src->cache, &dst->cache);
2863#endif
2864
2865 return ret;
2866}
2867#endif
2868#endif /* !WOLFSSL_TI_HASH */
2869
2870#endif /* NO_SHA256 */