source: asp3_tinet_ecnl_arm/trunk/wolfssl-3.12.2/wolfcrypt/src/sha256.c@ 352

Last change on this file since 352 was 352, checked in by coas-nagasima, 6 years ago

Added the ASP3 version of ECNL for ARM

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 84.8 KB
1/* sha256.c
2 *
3 * Copyright (C) 2006-2017 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL.
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20 */
21
22
23/* code submitted by raphael.huck@efixo.com */
24
25#ifdef HAVE_CONFIG_H
26 #include <config.h>
27#endif
28
29#include <wolfssl/wolfcrypt/settings.h>
30
31#if !defined(NO_SHA256)
32
33#include <wolfssl/wolfcrypt/sha256.h>
34#include <wolfssl/wolfcrypt/error-crypt.h>
35#include <wolfssl/wolfcrypt/cpuid.h>
36
37/* fips wrapper calls, user can call direct */
38#ifdef HAVE_FIPS
39
40 int wc_InitSha256(wc_Sha256* sha)
41 {
42 if (sha == NULL) {
43 return BAD_FUNC_ARG;
44 }
45 return InitSha256_fips(sha);
46 }
47 int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
48 {
49 (void)heap;
50 (void)devId;
51 if (sha == NULL) {
52 return BAD_FUNC_ARG;
53 }
54 return InitSha256_fips(sha);
55 }
56 int wc_Sha256Update(wc_Sha256* sha, const byte* data, word32 len)
57 {
58 if (sha == NULL || (data == NULL && len > 0)) {
59 return BAD_FUNC_ARG;
60 }
61 return Sha256Update_fips(sha, data, len);
62 }
63 int wc_Sha256Final(wc_Sha256* sha, byte* out)
64 {
65 if (sha == NULL || out == NULL) {
66 return BAD_FUNC_ARG;
67 }
68 return Sha256Final_fips(sha, out);
69 }
70 void wc_Sha256Free(wc_Sha256* sha)
71 {
72 (void)sha;
73 /* Not supported in FIPS */
74 }
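    /* Illustrative caller-side sketch of the wrappers above; msg, msgSz and
     * digest are hypothetical names supplied by the caller:
     *
     *     byte      digest[WC_SHA256_DIGEST_SIZE];
     *     wc_Sha256 sha;
     *     int       ret = wc_InitSha256(&sha);
     *     if (ret == 0)
     *         ret = wc_Sha256Update(&sha, msg, msgSz);
     *     if (ret == 0)
     *         ret = wc_Sha256Final(&sha, digest);
     *     wc_Sha256Free(&sha);
     */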
75
76#else /* else build without fips */
77
78
79#if defined(WOLFSSL_TI_HASH)
80 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
81#else
82
83#include <wolfssl/wolfcrypt/logging.h>
84
85#ifdef NO_INLINE
86 #include <wolfssl/wolfcrypt/misc.h>
87#else
88 #define WOLFSSL_MISC_INCLUDED
89 #include <wolfcrypt/src/misc.c>
90#endif
91
92
93#if defined(USE_INTEL_SPEEDUP)
94 #define HAVE_INTEL_AVX1
95 #define HAVE_INTEL_AVX2
96#endif /* USE_INTEL_SPEEDUP */
97
98#if defined(HAVE_INTEL_AVX2)
99 #define HAVE_INTEL_RORX
100#endif
101
102
103static INLINE void AddLength(wc_Sha256* sha256, word32 len);
104
105#if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH)
106static int InitSha256(wc_Sha256* sha256)
107{
108 int ret = 0;
109
110 if (sha256 == NULL)
111 return BAD_FUNC_ARG;
112
113 XMEMSET(sha256->digest, 0, sizeof(sha256->digest));
114 sha256->digest[0] = 0x6A09E667L;
115 sha256->digest[1] = 0xBB67AE85L;
116 sha256->digest[2] = 0x3C6EF372L;
117 sha256->digest[3] = 0xA54FF53AL;
118 sha256->digest[4] = 0x510E527FL;
119 sha256->digest[5] = 0x9B05688CL;
120 sha256->digest[6] = 0x1F83D9ABL;
121 sha256->digest[7] = 0x5BE0CD19L;
122
123 sha256->buffLen = 0;
124 sha256->loLen = 0;
125 sha256->hiLen = 0;
126
127 return ret;
128}
129#endif
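/* The eight initial digest words set in InitSha256() are the SHA-256 IV from
 * FIPS 180-4: the first 32 bits of the fractional parts of the square roots
 * of the first eight primes. A minimal stand-alone check, using a
 * hypothetical helper that is not part of this file (4294967296.0 is 2^32):
 *
 *     #include <math.h>
 *     static word32 iv_word(double prime)
 *     {
 *         double frac = sqrt(prime) - floor(sqrt(prime));
 *         return (word32)(frac * 4294967296.0);
 *     }
 *
 * iv_word(2.0) == 0x6A09E667 and iv_word(3.0) == 0xBB67AE85, matching the
 * first two words above.
 */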
130
131
132/* Hardware Acceleration */
133#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
134
135 /* in case intel instructions aren't available, plus we need the K[] global */
136 #define NEED_SOFT_SHA256
137
138 /*****
139 Intel AVX1/AVX2 Macro Control Structure
140
141 #define HAVE_INTEL_AVX1
142 #define HAVE_INTEL_AVX2
143
144 #define HAVE_INTEL_RORX
145
146
147 int InitSha256(wc_Sha256* sha256) {
148 Save/Recover XMM, YMM
149 ...
150 }
151
152 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
153 Transform(); Function prototype
154 #else
155 Transform() { }
156 int Sha256Final() {
157 Save/Recover XMM, YMM
158 ...
159 }
160 #endif
161
162 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
163 #if defined(HAVE_INTEL_RORX)
164 #define RND with rorx instruction
165 #else
166 #define RND
167 #endif
168 #endif
169
170 #if defined(HAVE_INTEL_AVX1)
171
172 #define XMM Instructions/inline asm
173
174 int Transform() {
175 Stitched Message Sched/Round
176 }
177
178 #elif defined(HAVE_INTEL_AVX2)
179
180 #define YMM Instructions/inline asm
181
182 int Transform() {
183 More granular Stitched Message Sched/Round
184 }
185
186 #endif
187
188 */
189
190 /* Each platform needs to query info type 1 from cpuid to see if AVX is
191 * supported. Also, let's set up a macro for proper linkage w/o ABI conflicts
192 */
193
194 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
195 static int Transform(wc_Sha256* sha256);
196 #if defined(HAVE_INTEL_AVX1)
197 static int Transform_AVX1(wc_Sha256 *sha256);
198 #endif
199 #if defined(HAVE_INTEL_AVX2)
200 static int Transform_AVX2(wc_Sha256 *sha256);
201 static int Transform_AVX1_RORX(wc_Sha256 *sha256);
202 #endif
203 static int (*Transform_p)(wc_Sha256* sha256) /* = _Transform */;
204 static int transform_check = 0;
205 static word32 intel_flags;
206 #define XTRANSFORM(S, B) (*Transform_p)((S))
207
208 static void Sha256_SetTransform(void)
209 {
210
211 if (transform_check)
212 return;
213
214 intel_flags = cpuid_get_flags();
215
216 #if defined(HAVE_INTEL_AVX2)
217 if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) {
218 if (1)
219 Transform_p = Transform_AVX1_RORX;
220 else
221 Transform_p = Transform_AVX2;
222 }
223 else
224 #endif
225 #if defined(HAVE_INTEL_AVX1)
226 if (1) {
227 Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 :
228 Transform);
229 }
230 else
231 #endif
232 Transform_p = Transform;
233
234 transform_check = 1;
235 }
236
237 /* Dummy for saving MM_REGs on behalf of Transform */
238 #if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1)
239 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
240 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
241 #elif defined(HAVE_INTEL_AVX1)
242 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
243 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
244 "xmm11","xmm12","xmm13","xmm14","xmm15")
245 #endif
246
247 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
248 {
249 int ret = 0;
250 if (sha256 == NULL)
251 return BAD_FUNC_ARG;
252
253 sha256->heap = heap;
254
255 ret = InitSha256(sha256);
256 if (ret != 0)
257 return ret;
258
259 /* choose best Transform function under this runtime environment */
260 Sha256_SetTransform();
261
262 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
263 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
264 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
265 #else
266 (void)devId;
267 #endif /* WOLFSSL_ASYNC_CRYPT */
268
269 return ret;
270 }
271
272#elif defined(FREESCALE_LTC_SHA)
273 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
274 {
275 (void)heap;
276 (void)devId;
277
278 LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0);
279
280 return 0;
281 }
282
283#elif defined(FREESCALE_MMCAU_SHA)
284
285 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
286 #include "cau_api.h"
287 #else
288 #include "fsl_mmcau.h"
289 #endif
290
291 #define XTRANSFORM(S, B) Transform((S), (B))
292
293 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
294 {
295 int ret = 0;
296
297 (void)heap;
298 (void)devId;
299
300 ret = wolfSSL_CryptHwMutexLock();
301 if (ret != 0) {
302 return ret;
303 }
304 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
305 cau_sha256_initialize_output(sha256->digest);
306 #else
307 MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest);
308 #endif
309 wolfSSL_CryptHwMutexUnLock();
310
311 sha256->buffLen = 0;
312 sha256->loLen = 0;
313 sha256->hiLen = 0;
314
315 return ret;
316 }
317
318 static int Transform(wc_Sha256* sha256, byte* buf)
319 {
320 int ret = wolfSSL_CryptHwMutexLock();
321 if (ret == 0) {
322 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
323 cau_sha256_hash_n(buf, 1, sha256->digest);
324 #else
325 MMCAU_SHA256_HashN(buf, 1, sha256->digest);
326 #endif
327 wolfSSL_CryptHwMutexUnLock();
328 }
329 return ret;
330 }
331
332#elif defined(WOLFSSL_PIC32MZ_HASH)
333 #include <wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h>
334
335#elif defined(STM32_HASH)
336
337 /*
338 * STM32F2/F4/F7 hardware SHA256 support through the HASH_* APIs from the
339 * Standard Peripheral Library or CubeMX (See note in README).
340 */
341
342 /* STM32 register size, bytes */
343 #ifdef WOLFSSL_STM32_CUBEMX
344 #define SHA256_REG_SIZE SHA256_BLOCK_SIZE
345 #else
346 #define SHA256_REG_SIZE 4
347 /* STM32 struct notes:
348 * sha256->buffer = first 4 bytes used to hold partial block if needed
349 * sha256->buffLen = num bytes currently stored in sha256->buffer
350 * sha256->loLen = num bytes that have been written to STM32 FIFO
351 */
352 #endif
353 #define SHA256_HW_TIMEOUT 0xFF
354
355 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
356 {
357 if (sha256 == NULL)
358 return BAD_FUNC_ARG;
359
360 sha256->heap = heap;
361 XMEMSET(sha256->buffer, 0, sizeof(sha256->buffer));
362 sha256->buffLen = 0;
363 sha256->loLen = 0;
364 sha256->hiLen = 0;
365
366 /* initialize HASH peripheral */
367 #ifdef WOLFSSL_STM32_CUBEMX
368 HAL_HASH_DeInit(&sha256->hashHandle);
369 sha256->hashHandle.Init.DataType = HASH_DATATYPE_8B;
370 if (HAL_HASH_Init(&sha256->hashHandle) != HAL_OK) {
371 return ASYNC_INIT_E;
372 }
373 /* reset the hash control register */
374 /* required because Cube MX is not clearing algo bits */
375 HASH->CR &= ~HASH_CR_ALGO;
376 #else
377 HASH_DeInit();
378
379 /* reset the hash control register */
380 HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE);
381
382 /* configure algo used, algo mode, datatype */
383 HASH->CR |= (HASH_AlgoSelection_SHA256 | HASH_AlgoMode_HASH
384 | HASH_DataType_8b);
385
386 /* reset HASH processor */
387 HASH->CR |= HASH_CR_INIT;
388 #endif
389
390 return 0;
391 }
392
393 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
394 {
395 int ret = 0;
396 byte* local;
397
398 if (sha256 == NULL || (data == NULL && len > 0)) {
399 return BAD_FUNC_ARG;
400 }
401
402 /* do block size increments */
403 local = (byte*)sha256->buffer;
404
405 /* check that internal buffLen is valid */
406 if (sha256->buffLen >= SHA256_REG_SIZE)
407 return BUFFER_E;
408
409 while (len) {
410 word32 add = min(len, SHA256_REG_SIZE - sha256->buffLen);
411 XMEMCPY(&local[sha256->buffLen], data, add);
412
413 sha256->buffLen += add;
414 data += add;
415 len -= add;
416
417 if (sha256->buffLen == SHA256_REG_SIZE) {
418 #ifdef WOLFSSL_STM32_CUBEMX
419 if (HAL_HASHEx_SHA256_Accumulate(
420 &sha256->hashHandle, local, SHA256_REG_SIZE) != HAL_OK) {
421 ret = ASYNC_OP_E;
422 }
423 #else
424 HASH_DataIn(*(uint32_t*)local);
425 #endif
426
427 AddLength(sha256, SHA256_REG_SIZE);
428 sha256->buffLen = 0;
429 }
430 }
431 return ret;
432 }
433
434 int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
435 {
436 int ret = 0;
437
438 if (sha256 == NULL || hash == NULL)
439 return BAD_FUNC_ARG;
440
441 #ifdef WOLFSSL_STM32_CUBEMX
442 if (HAL_HASHEx_SHA256_Start(&sha256->hashHandle,
443 (byte*)sha256->buffer, sha256->buffLen,
444 (byte*)sha256->digest, SHA256_HW_TIMEOUT) != HAL_OK) {
445 ret = ASYNC_OP_E;
446 }
447 #else
448 __IO uint16_t nbvalidbitsdata = 0;
449
450 /* finish reading any trailing bytes into FIFO */
451 if (sha256->buffLen > 0) {
452 HASH_DataIn(*(uint32_t*)sha256->buffer);
453 AddLength(sha256, sha256->buffLen);
454 }
455
456 /* calculate number of valid bits in last word of input data */
457 nbvalidbitsdata = 8 * (sha256->loLen % SHA256_REG_SIZE);
458
459 /* configure number of valid bits in last word of the data */
460 HASH_SetLastWordValidBitsNbr(nbvalidbitsdata);
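        /* worked example: for a 7-byte message one full 4-byte word is pushed
         * during Update and the remaining 3 bytes are pushed just above, so
         * loLen == 7 and nbvalidbitsdata == 8 * (7 % 4) == 24 */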
461
462 /* start HASH processor */
463 HASH_StartDigest();
464
465 /* wait until Busy flag == RESET */
466 while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {}
467
468 /* read message digest */
469 sha256->digest[0] = HASH->HR[0];
470 sha256->digest[1] = HASH->HR[1];
471 sha256->digest[2] = HASH->HR[2];
472 sha256->digest[3] = HASH->HR[3];
473 sha256->digest[4] = HASH->HR[4];
474 sha256->digest[5] = HASH_DIGEST->HR[5];
475 sha256->digest[6] = HASH_DIGEST->HR[6];
476 sha256->digest[7] = HASH_DIGEST->HR[7];
477
478 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
479 #endif /* WOLFSSL_STM32_CUBEMX */
480
481 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
482
483 (void)wc_InitSha256_ex(sha256, sha256->heap, INVALID_DEVID);
484
485 return ret;
486 }
487
488#else
489 #define NEED_SOFT_SHA256
490
491 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
492 {
493 int ret = 0;
494 if (sha256 == NULL)
495 return BAD_FUNC_ARG;
496
497 sha256->heap = heap;
498
499 ret = InitSha256(sha256);
500 if (ret != 0)
501 return ret;
502
503 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
504 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
505 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
506 #else
507 (void)devId;
508 #endif /* WOLFSSL_ASYNC_CRYPT */
509
510 return ret;
511 }
512#endif /* End Hardware Acceleration */
513
514#ifndef SAVE_XMM_YMM
515 #define SAVE_XMM_YMM
516#endif
517
518#ifdef NEED_SOFT_SHA256
519
520 static const ALIGN32 word32 K[64] = {
521 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
522 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
523 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
524 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
525 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
526 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
527 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
528 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
529 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
530 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
531 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
532 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
533 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
534 };
535
536 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
537 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
538 #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
539
540 #define S(x, n) rotrFixed(x, n)
541 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
542 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
543 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
544 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
545
546 #define RND(a,b,c,d,e,f,g,h,i) \
547 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
548 t1 = Sigma0((a)) + Maj((a), (b), (c)); \
549 (d) += t0; \
550 (h) = t0 + t1;
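    /* In FIPS 180-4 terms t0 is T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
     * and t1 is T2 = Sigma0(a) + Maj(a,b,c); each RND invocation performs one
     * round, the caller rotating the argument order instead of shifting the
     * working variables. */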
551
552 #ifndef XTRANSFORM
553 #define XTRANSFORM(S, B) Transform((S))
554 #endif
555
556 static int Transform(wc_Sha256* sha256)
557 {
558 word32 S[8], t0, t1;
559 int i;
560
561 #ifdef WOLFSSL_SMALL_STACK
562 word32* W;
563
564 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
565 DYNAMIC_TYPE_TMP_BUFFER);
566 if (W == NULL)
567 return MEMORY_E;
568 #else
569 word32 W[WC_SHA256_BLOCK_SIZE];
570 #endif
571
572 /* Copy context->state[] to working vars */
573 for (i = 0; i < 8; i++)
574 S[i] = sha256->digest[i];
575
576 for (i = 0; i < 16; i++)
577 W[i] = sha256->buffer[i];
578
579 for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
580 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
581
582 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
583 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
584 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
585 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
586 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
587 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
588 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
589 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
590 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
591 }
592
593 /* Add the working vars back into digest state[] */
594 for (i = 0; i < 8; i++) {
595 sha256->digest[i] += S[i];
596 }
597
598 #ifdef WOLFSSL_SMALL_STACK
599 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
600 #endif
601
602 return 0;
603 }
604#endif
605/* End wc_ software implementation */
606
607
608#if defined(XTRANSFORM) || defined(STM32_HASH)
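/* loLen and hiLen together hold the running message length in bytes as one
 * 64-bit value split across two 32-bit words; for example, with
 * loLen == 0xFFFFFFFD, AddLength(sha256, 8) wraps loLen to 5 and increments
 * hiLen. */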
609static INLINE void AddLength(wc_Sha256* sha256, word32 len)
610{
611 word32 tmp = sha256->loLen;
612 if ( (sha256->loLen += len) < tmp)
613 sha256->hiLen++; /* carry low to high */
614}
615#endif
616
617
618#ifdef XTRANSFORM
619
620 static INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
621 {
622 int ret = 0;
623 byte* local;
624
625 if (sha256 == NULL || (data == NULL && len > 0)) {
626 return BAD_FUNC_ARG;
627 }
628
629 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
630 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
631 #if defined(HAVE_INTEL_QA)
632 return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len);
633 #endif
634 }
635 #endif /* WOLFSSL_ASYNC_CRYPT */
636
637 /* do block size increments */
638 local = (byte*)sha256->buffer;
639
640 /* check that internal buffLen is valid */
641 if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE)
642 return BUFFER_E;
643
644 SAVE_XMM_YMM; /* for Intel AVX */
645
646 while (len) {
647 word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
648 XMEMCPY(&local[sha256->buffLen], data, add);
649
650 sha256->buffLen += add;
651 data += add;
652 len -= add;
653
654 if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
655 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
656 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
657 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
658 #endif
659 {
660 ByteReverseWords(sha256->buffer, sha256->buffer,
661 WC_SHA256_BLOCK_SIZE);
662 }
663 #endif
664 ret = XTRANSFORM(sha256, local);
665 if (ret != 0) {
666 break;
667 }
668
669 AddLength(sha256, WC_SHA256_BLOCK_SIZE);
670 sha256->buffLen = 0;
671 }
672 }
673
674 return ret;
675 }
676
677 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
678 {
679 return Sha256Update(sha256, data, len);
680 }
681
682 static INLINE int Sha256Final(wc_Sha256* sha256)
683 {
684
685 int ret;
686 byte* local = (byte*)sha256->buffer;
687
688 if (sha256 == NULL) {
689 return BAD_FUNC_ARG;
690 }
691
692 SAVE_XMM_YMM; /* for Intel AVX */
693
694 AddLength(sha256, sha256->buffLen); /* before adding pads */
695 local[sha256->buffLen++] = 0x80; /* add 1 */
696
697 /* pad with zeros */
698 if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
699 XMEMSET(&local[sha256->buffLen], 0,
700 WC_SHA256_BLOCK_SIZE - sha256->buffLen);
701 sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
702
703 {
704 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
705 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
706 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
707 #endif
708 {
709 ByteReverseWords(sha256->buffer, sha256->buffer,
710 WC_SHA256_BLOCK_SIZE);
711 }
712 #endif
713 }
714
715 ret = XTRANSFORM(sha256, local);
716 if (ret != 0)
717 return ret;
718
719 sha256->buffLen = 0;
720 }
721 XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen);
722
723 /* put lengths in bits */
724 sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
725 (sha256->hiLen << 3);
726 sha256->loLen = sha256->loLen << 3;
727
728 /* store lengths */
729 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
730 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
731 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
732 #endif
733 {
734 ByteReverseWords(sha256->buffer, sha256->buffer,
735 WC_SHA256_BLOCK_SIZE);
736 }
737 #endif
738 /* ! length ordering dependent on digest endian type ! */
739 XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
740 XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
741 sizeof(word32));
742
743 #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \
744 defined(HAVE_INTEL_AVX2)
745 /* Kinetis requires only these bytes reversed */
746 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
747 if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
748 #endif
749 {
750 ByteReverseWords(
751 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
752 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
753 2 * sizeof(word32));
754 }
755 #endif
756
757 return XTRANSFORM(sha256, local);
758 }
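    /* Shape of the final block built above when the 0x80 marker still leaves
     * room for the length words (buffLen <= WC_SHA256_PAD_SIZE after it is
     * appended): remaining message bytes, the 0x80 marker, zero padding up to
     * WC_SHA256_PAD_SIZE, then hiLen and loLen, i.e. the 64-bit message length
     * in bits with the high word first. Otherwise an extra padding-only block
     * is transformed first. */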
759
760 int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
761 {
762 int ret;
763
764 if (sha256 == NULL || hash == NULL) {
765 return BAD_FUNC_ARG;
766 }
767
768 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
769 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
770 #if defined(HAVE_INTEL_QA)
771 return IntelQaSymSha256(&sha256->asyncDev, hash, NULL,
772 WC_SHA256_DIGEST_SIZE);
773 #endif
774 }
775 #endif /* WOLFSSL_ASYNC_CRYPT */
776
777 ret = Sha256Final(sha256);
778 if (ret != 0)
779 return ret;
780
781 #if defined(LITTLE_ENDIAN_ORDER)
782 ByteReverseWords(sha256->digest, sha256->digest, WC_SHA256_DIGEST_SIZE);
783 #endif
784 XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
785
786 return InitSha256(sha256); /* reset state */
787 }
788
789#endif /* XTRANSFORM */
790
791
792#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
793
794#define _DigestToReg(S0, S1, S2, S3, S4, S5, S6, S7) \
795 "leaq %[digest], %%r8\n\t" \
796 "movl (%%r8), %"#S0"\n\t" \
797 "movl 4(%%r8), %"#S1"\n\t" \
798 "movl 8(%%r8), %"#S2"\n\t" \
799 "movl 12(%%r8), %"#S3"\n\t" \
800 "movl 16(%%r8), %"#S4"\n\t" \
801 "movl 20(%%r8), %"#S5"\n\t" \
802 "movl 24(%%r8), %"#S6"\n\t" \
803 "movl 28(%%r8), %"#S7"\n\t"
804
805#define _RegToDigest(S0, S1, S2, S3, S4, S5, S6, S7) \
806 "leaq %[digest], %%r8\n\t" \
807 "addl %"#S0", (%%r8)\n\t" \
808 "addl %"#S1", 4(%%r8)\n\t" \
809 "addl %"#S2", 8(%%r8)\n\t" \
810 "addl %"#S3", 12(%%r8)\n\t" \
811 "addl %"#S4", 16(%%r8)\n\t" \
812 "addl %"#S5", 20(%%r8)\n\t" \
813 "addl %"#S6", 24(%%r8)\n\t" \
814 "addl %"#S7", 28(%%r8)\n\t"
815
816#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
817 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
818
819#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
820 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
821
822
823#define S_0 %r15d
824#define S_1 %r10d
825#define S_2 %r11d
826#define S_3 %r12d
827#define S_4 %r13d
828#define S_5 %r14d
829#define S_6 %ebx
830#define S_7 %r9d
831
832#define SSE_REGs "%edi", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
833
834#if defined(HAVE_INTEL_RORX)
835#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i) \
836 "# edx = e>>>6\n\t" \
837 "rorx $6, %"#e", %%edx\n\t"
838
839#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i) \
840 "# edi = e>>>11\n\t" \
841 "rorx $11, %"#e",%%edi\n\t" \
842 "# edi = (e>>11) ^ (e>>6)\n\t" \
843 "xorl %%edx, %%edi\n\t" \
844 "# edx = e>>>25\n\t" \
845 "rorx $25, %"#e", %%edx\n\t"
846
847#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i) \
848 "# esi = f\n\t" \
849 "movl %"#f", %%esi\n\t" \
850 "# esi = f ^ g\n\t" \
851 "xorl %"#g", %%esi\n\t" \
852 "# edx = Sigma1(e)\n\t" \
853 "xorl %%edi, %%edx\n\t" \
854 "# esi = (f ^ g) & e\n\t" \
855 "andl %"#e", %%esi\n\t" \
856 "# esi = Ch(e,f,g)\n\t" \
857 "xorl %"#g", %%esi\n\t"
858
859#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i) \
860 "# h += w_k\n\t" \
861 "leaq %[W_K], %%r8\n\t" \
862 "addl ("#i")*4(%%r8), %"#h"\n\t" \
863 "# h = h + w_k + Sigma1(e)\n\t" \
864 "addl %%edx, %"#h"\n\t" \
865 "# r8d = a>>>2\n\t" \
866 "rorx $2, %"#a", %%r8d\n\t" \
867 "# edi = a>>>13\n\t" \
868 "rorx $13, %"#a", %%edi\n\t"
869
870#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i) \
871 "# edx = a>>22\n\t" \
872 "rorx $22, %"#a", %%edx\n\t" \
873 "# edi = (a>>>2) ^ (a>>>13)\n\t" \
874 "xorl %%r8d, %%edi\n\t" \
875 "# edx = Sigma0(a)\n\t" \
876 "xorl %%edi, %%edx\n\t"
877
878#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i) \
879 "# edi = b\n\t" \
880 "movl %"#b", %%edi\n\t" \
881 "# edi = a | b\n\t" \
882 "orl %"#a", %%edi\n\t" \
883 "# edi = (a | b) & c\n\t" \
884 "andl %"#c", %%edi\n\t" \
885 "# r8d = b\n\t" \
886 "movl %"#b", %%r8d\n\t"
887
888#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i) \
889 "# h += Ch(e,f,g)\n\t" \
890 "addl %%esi, %"#h"\n\t" \
891 "# r8d = b & a\n\t" \
892 "andl %"#a", %%r8d\n\t" \
893 "# r8d = Maj(a,b,c)\n\t" \
894 "orl %%edi, %%r8d\n\t"
895
896#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i) \
897 "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \
898 "addl %"#h", %"#d"\n\t" \
899 "addl %"#h", %%r8d\n\t" \
900 "addl %%edx, %%r8d\n\t" \
901 "movl %%r8d, %"#h"\n\t"
902#endif /* HAVE_INTEL_RORX */
903
904#define RND_STEP_1(a,b,c,d,e,f,g,h,i) \
905 "movl %"#e", %%edx\n\t" \
906 "# edx = e>>>6\n\t" \
907 "roll $26, %%edx\n\t" \
908 "movl %"#e", %%edi\n\t"
909
910#define RND_STEP_2(a,b,c,d,e,f,g,h,i) \
911 "# edi = e>>>11\n\t" \
912 "roll $21, %%edi\n\t" \
913 "# edi = (e>>11) ^ (e>>6)\n\t" \
914 "xorl %%edx, %%edi\n\t" \
915 "# edx = e\n\t" \
916 "movl %"#e", %%edx\n\t" \
917 "# edx = e>>>25\n\t" \
918 "roll $7, %%edx\n\t"
919
920#define RND_STEP_3(a,b,c,d,e,f,g,h,i) \
921 "# esi = f\n\t" \
922 "movl %"#f", %%esi\n\t" \
923 "# esi = f ^ g\n\t" \
924 "xorl %"#g", %%esi\n\t" \
925 "# edx = Sigma1(e)\n\t" \
926 "xorl %%edi, %%edx\n\t" \
927 "# esi = (f ^ g) & e\n\t" \
928 "andl %"#e", %%esi\n\t" \
929 "# esi = Ch(e,f,g)\n\t" \
930 "xorl %"#g", %%esi\n\t"
931
932#define RND_STEP_4(a,b,c,d,e,f,g,h,i) \
933 "# h += w_k\n\t" \
934 "leaq %[W_K], %%r8\n\t" \
935 "addl ("#i")*4(%%r8), %"#h"\n\t" \
936 "# h = h + w_k + Sigma1(e)\n\t" \
937 "addl %%edx, %"#h"\n\t" \
938 "# r8d = a\n\t" \
939 "movl %"#a", %%r8d\n\t" \
940 "# r8d = a>>>2\n\t" \
941 "roll $30, %%r8d\n\t" \
942 "# edi = a\n\t" \
943 "movl %"#a", %%edi\n\t" \
944 "# edi = a>>>13\n\t" \
945 "roll $19, %%edi\n\t" \
946 "# edx = a\n\t" \
947 "movl %"#a", %%edx\n\t"
948
949#define RND_STEP_5(a,b,c,d,e,f,g,h,i) \
950 "# edx = a>>>22\n\t" \
951 "roll $10, %%edx\n\t" \
952 "# edi = (a>>>2) ^ (a>>>13)\n\t" \
953 "xorl %%r8d, %%edi\n\t" \
954 "# edx = Sigma0(a)\n\t" \
955 "xorl %%edi, %%edx\n\t"
956
957#define RND_STEP_6(a,b,c,d,e,f,g,h,i) \
958 "# edi = b\n\t" \
959 "movl %"#b", %%edi\n\t" \
960 "# edi = a | b\n\t" \
961 "orl %"#a", %%edi\n\t" \
962 "# edi = (a | b) & c\n\t" \
963 "andl %"#c", %%edi\n\t" \
964 "# r8d = b\n\t" \
965 "movl %"#b", %%r8d\n\t"
966
967#define RND_STEP_7(a,b,c,d,e,f,g,h,i) \
968 "# h += Ch(e,f,g)\n\t" \
969 "addl %%esi, %"#h"\n\t" \
970 "#r8d = b & a\n\t" \
971 "andl %"#a", %%r8d\n\t" \
972 "# r8d = Maj(a,b,c)\n\t" \
973 "orl %%edi, %%r8d\n\t"
974
975#define RND_STEP_8(a,b,c,d,e,f,g,h,i) \
976 "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \
977 "addl %"#h", %"#d"\n\t" \
978 "# r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c)\n\t" \
979 "addl %"#h", %%r8d\n\t" \
980 "# r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)\n\t" \
981 "addl %%edx, %%r8d\n\t" \
982 "# h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)\n\t" \
983 "movl %%r8d, %"#h"\n\t"
984
985#define RND_X(a,b,c,d,e,f,g,h,i) \
986 RND_STEP_1(a,b,c,d,e,f,g,h,i) \
987 RND_STEP_2(a,b,c,d,e,f,g,h,i) \
988 RND_STEP_3(a,b,c,d,e,f,g,h,i) \
989 RND_STEP_4(a,b,c,d,e,f,g,h,i) \
990 RND_STEP_5(a,b,c,d,e,f,g,h,i) \
991 RND_STEP_6(a,b,c,d,e,f,g,h,i) \
992 RND_STEP_7(a,b,c,d,e,f,g,h,i) \
993 RND_STEP_8(a,b,c,d,e,f,g,h,i)
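/* RND_X strings RND_STEP_1..RND_STEP_8 together into one complete SHA-256
 * round using only scalar registers; the MessageSched macros below interleave
 * ("stitch") these same steps with AVX message-schedule instructions so the
 * scalar and vector work overlap. */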
994
995#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
996#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
997#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
998#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
999#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1000#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1001#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1002#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1003
1004
1005#define RND_1_3(a,b,c,d,e,f,g,h,i) \
1006 RND_STEP_1(a,b,c,d,e,f,g,h,i) \
1007 RND_STEP_2(a,b,c,d,e,f,g,h,i) \
1008 RND_STEP_3(a,b,c,d,e,f,g,h,i)
1009
1010#define RND_4_6(a,b,c,d,e,f,g,h,i) \
1011 RND_STEP_4(a,b,c,d,e,f,g,h,i) \
1012 RND_STEP_5(a,b,c,d,e,f,g,h,i) \
1013 RND_STEP_6(a,b,c,d,e,f,g,h,i)
1014
1015#define RND_7_8(a,b,c,d,e,f,g,h,i) \
1016 RND_STEP_7(a,b,c,d,e,f,g,h,i) \
1017 RND_STEP_8(a,b,c,d,e,f,g,h,i)
1018
1019#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1020#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1021#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1022#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1023#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1024#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1025#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1026#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1027
1028
1029#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1030#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1031#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1032#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1033#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1034#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1035#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1036#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1037
1038#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1039#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1040#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1041#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1042#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1043#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1044#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1045#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1046
1047#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1048#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1049#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1050#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1051#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1052#define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1053#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1054#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1055
1056#define FOR(cnt, init, max, inc, loop) \
1057 __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):)
1058#define END(cnt, init, max, inc, loop) \
1059 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::);
1060
1061#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
1062
1063#if defined(HAVE_INTEL_AVX1) /* inline assembler for Intel AVX1 instructions */
1064
1065#define VPALIGNR(op1,op2,op3,op4) \
1066 "vpalignr $"#op4", %"#op3", %"#op2", %"#op1"\n\t"
1067#define VPADDD(op1,op2,op3) \
1068 "vpaddd %"#op3", %"#op2", %"#op1"\n\t"
1069#define VPSRLD(op1,op2,op3) \
1070 "vpsrld $"#op3", %"#op2", %"#op1"\n\t"
1071#define VPSRLQ(op1,op2,op3) \
1072 "vpsrlq $"#op3", %"#op2", %"#op1"\n\t"
1073#define VPSLLD(op1,op2,op3) \
1074 "vpslld $"#op3", %"#op2", %"#op1"\n\t"
1075#define VPOR(op1,op2,op3) \
1076 "vpor %"#op3", %"#op2", %"#op1"\n\t"
1077#define VPXOR(op1,op2,op3) \
1078 "vpxor %"#op3", %"#op2", %"#op1"\n\t"
1079#define VPSHUFD(op1,op2,op3) \
1080 "vpshufd $"#op3", %"#op2", %"#op1"\n\t"
1081#define VPSHUFB(op1,op2,op3) \
1082 "vpshufb %"#op3", %"#op2", %"#op1"\n\t"
1083
1084#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
1085 a,b,c,d,e,f,g,h,_i)\
1086 RND_STEP_1(a,b,c,d,e,f,g,h,_i)\
1087 VPALIGNR (XTMP0, X3, X2, 4)\
1088 RND_STEP_2(a,b,c,d,e,f,g,h,_i)\
1089 VPADDD (XTMP0, XTMP0, X0)\
1090 RND_STEP_3(a,b,c,d,e,f,g,h,_i)\
1091 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\
1092 RND_STEP_4(a,b,c,d,e,f,g,h,_i)\
1093 VPSRLD (XTMP2, XTMP1, 7)\
1094 RND_STEP_5(a,b,c,d,e,f,g,h,_i)\
1095 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
1096 RND_STEP_6(a,b,c,d,e,f,g,h,_i)\
1097 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\
1098 RND_STEP_7(a,b,c,d,e,f,g,h,_i)\
1099 VPSRLD (XTMP2, XTMP1,18)\
1100 RND_STEP_8(a,b,c,d,e,f,g,h,_i)\
1101\
1102 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1)\
1103 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\
1104 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1)\
1105 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
1106 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1)\
1107 VPXOR (XTMP3, XTMP3, XTMP1)\
1108 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1)\
1109 VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
1110 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1)\
1111 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\
1112 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1)\
1113 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\
1114 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1)\
1115 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\
1116 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1)\
1117\
1118 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2)\
1119 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\
1120 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2)\
1121 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
1122 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2)\
1123 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
1124 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2)\
1125 VPXOR (XTMP2, XTMP2, XTMP3)\
1126 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2)\
1127 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\
1128 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2)\
1129 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\
1130 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2)\
1131 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\
1132 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2)\
1133\
1134 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3)\
1135 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\
1136 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3)\
1137 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\
1138 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3)\
1139 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
1140 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3)\
1141 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
1142 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3)\
1143 VPXOR (XTMP2, XTMP2, XTMP3)\
1144 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3)\
1145 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\
1146 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3)\
1147 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\
1148 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3)\
1149 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\
1150
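/* One MessageSched expansion interleaves four rounds (_i .. _i+3) with the
 * computation of the next four message-schedule words, leaving its first
 * vector argument holding {W[3], W[2], W[1], W[0]} for the next group. */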
1151#if defined(HAVE_INTEL_RORX)
1152
1153#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
1154 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
1155 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i)\
1156 VPALIGNR (XTMP0, X3, X2, 4)\
1157 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i)\
1158 VPADDD (XTMP0, XTMP0, X0)\
1159 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i)\
1160 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\
1161 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i)\
1162 VPSRLD (XTMP2, XTMP1, 7)\
1163 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i)\
1164 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
1165 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i)\
1166 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\
1167 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i)\
1168 VPSRLD (XTMP2, XTMP1,18)\
1169 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i)\
1170\
1171 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1)\
1172 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\
1173 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1)\
1174 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
1175 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1)\
1176 VPXOR (XTMP3, XTMP3, XTMP1)\
1177 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1)\
1178 VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
1179 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1)\
1180 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\
1181 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1)\
1182 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\
1183 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1)\
1184 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\
1185 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1)\
1186\
1187 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2)\
1188 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\
1189 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2)\
1190 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
1191 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2)\
1192 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
1193 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2)\
1194 VPXOR (XTMP2, XTMP2, XTMP3)\
1195 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2)\
1196 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\
1197 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2)\
1198 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\
1199 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2)\
1200 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\
1201 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2)\
1202\
1203 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3)\
1204 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\
1205 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3)\
1206 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\
1207 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3)\
1208 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
1209 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3)\
1210 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
1211 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3)\
1212 VPXOR (XTMP2, XTMP2, XTMP3)\
1213 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3)\
1214 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\
1215 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3)\
1216 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\
1217 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3)\
1218 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\
1219
1220#endif /* HAVE_INTEL_RORX */
1221
1222
1223#define W_K_from_buff() \
1224 "leaq %[buf], %%r8\n\t" \
1225 "vmovdqu (%%r8), %%xmm4\n\t" \
1226 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" \
1227 "vmovdqu 16(%%r8), %%xmm5\n\t" \
1228 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" \
1229 "vmovdqu 32(%%r8), %%xmm6\n\t" \
1230 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" \
1231 "vmovdqu 48(%%r8), %%xmm7\n\t" \
1232 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"
1233
1234#define _SET_W_K_XFER(reg, i) \
1235 "leaq %[K], %%r8\n\t" \
1236 "vpaddd ("#i")*4(%%r8), %"#reg", %%xmm9\n\t" \
1237 "leaq %[W_K], %%r8\n\t" \
1238 "vmovdqa %%xmm9, ("#i")*4(%%r8)\n\t"
1239
1240#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
1241
1242static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
1243static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
1244static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b };
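/* mBYTE_FLIP_MASK is a vpshufb control that reverses the byte order within
 * each 32-bit lane, turning the little-endian message bytes loaded by
 * W_K_from_buff() into the big-endian words SHA-256 operates on. */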
1245
1246
1247#define _Init_Masks(mask1, mask2, mask3) \
1248 "vmovdqu %[FLIP], %"#mask1"\n\t" \
1249 "vmovdqu %[SHUF00BA], %"#mask2"\n\t" \
1250 "vmovdqu %[SHUFDC00], %"#mask3"\n\t"
1251
1252#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
1253 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1254
1255#define X0 %xmm4
1256#define X1 %xmm5
1257#define X2 %xmm6
1258#define X3 %xmm7
1259#define X_ X0
1260
1261#define XTMP0 %xmm0
1262#define XTMP1 %xmm1
1263#define XTMP2 %xmm2
1264#define XTMP3 %xmm3
1265#define XTMP4 %xmm8
1266#define XTMP5 %xmm9
1267#define XFER %xmm10
1268
1269#define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */
1270#define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */
1271#define BYTE_FLIP_MASK %xmm13
1272
1273
1274static int Transform_AVX1(wc_Sha256* sha256)
1275{
1276 ALIGN32 word32 W_K[64]; /* temp for W+K */
1277
1278 __asm__ __volatile__ (
1279
1280 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1281 "# X0, X1, X2, X3 = W[0..15]; \n\t"
1282 W_K_from_buff()
1283
1284 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1285
1286 SET_W_K_XFER(X0, 0)
1287 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1288 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0)
1289 SET_W_K_XFER(X1, 4)
1290 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1291 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4)
1292 SET_W_K_XFER(X2, 8)
1293 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1294 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1295 SET_W_K_XFER(X3, 12)
1296 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1297 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12)
1298 SET_W_K_XFER(X0, 16)
1299 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1300 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1301 SET_W_K_XFER(X1, 20)
1302 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1303 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20)
1304 SET_W_K_XFER(X2, 24)
1305 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1306 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1307 SET_W_K_XFER(X3, 28)
1308 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1309 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28)
1310 SET_W_K_XFER(X0, 32)
1311 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1312 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1313 SET_W_K_XFER(X1, 36)
1314 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1315 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36)
1316 SET_W_K_XFER(X2, 40)
1317 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1318 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1319 SET_W_K_XFER(X3, 44)
1320 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1321 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44)
1322
1323 SET_W_K_XFER(X0, 48)
1324 SET_W_K_XFER(X1, 52)
1325 SET_W_K_XFER(X2, 56)
1326 SET_W_K_XFER(X3, 60)
1327
1328 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1329 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1330 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1331 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1332
1333 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1334 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1335 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1336 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1337
1338 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56)
1339 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57)
1340 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58)
1341 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59)
1342
1343 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60)
1344 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61)
1345 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62)
1346 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63)
1347
1348 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1349
1350 :
1351 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1352 [SHUF00BA] "m" (mSHUF_00BA[0]),
1353 [SHUFDC00] "m" (mSHUF_DC00[0]),
1354 [digest] "m" (sha256->digest),
1355 [buf] "m" (sha256->buffer),
1356 [K] "m" (K),
1357 [W_K] "m" (W_K)
1358 : SSE_REGs, "memory"
1359 );
1360
1361 return 0;
1362}
1363
1364#if defined(HAVE_INTEL_RORX)
1365static int Transform_AVX1_RORX(wc_Sha256* sha256)
1366{
1367 ALIGN32 word32 W_K[64]; /* temp for W+K */
1368
1369 __asm__ __volatile__ (
1370
1371 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1372 "# X0, X1, X2, X3 = W[0..15]; \n\t"
1373 W_K_from_buff()
1374
1375 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1376
1377 SET_W_K_XFER(X0, 0)
1378 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1379 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0)
1380 SET_W_K_XFER(X1, 4)
1381 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1382 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4)
1383 SET_W_K_XFER(X2, 8)
1384 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1385 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1386 SET_W_K_XFER(X3, 12)
1387 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1388 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12)
1389 SET_W_K_XFER(X0, 16)
1390 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1391 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1392 SET_W_K_XFER(X1, 20)
1393 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1394 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20)
1395 SET_W_K_XFER(X2, 24)
1396 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1397 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1398 SET_W_K_XFER(X3, 28)
1399 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1400 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28)
1401 SET_W_K_XFER(X0, 32)
1402 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1403 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1404 SET_W_K_XFER(X1, 36)
1405 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1406 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36)
1407 SET_W_K_XFER(X2, 40)
1408 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1409 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1410 SET_W_K_XFER(X3, 44)
1411 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1412 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44)
1413
1414 SET_W_K_XFER(X0, 48)
1415 SET_W_K_XFER(X1, 52)
1416 SET_W_K_XFER(X2, 56)
1417 SET_W_K_XFER(X3, 60)
1418
1419 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1420 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1421 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1422 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1423
1424 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1425 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1426 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1427 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1428
1429 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56)
1430 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57)
1431 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58)
1432 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59)
1433
1434 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60)
1435 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61)
1436 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62)
1437 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63)
1438
1439 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1440
1441 :
1442 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1443 [SHUF00BA] "m" (mSHUF_00BA[0]),
1444 [SHUFDC00] "m" (mSHUF_DC00[0]),
1445 [digest] "m" (sha256->digest),
1446 [buf] "m" (sha256->buffer),
1447 [K] "m" (K),
1448 [W_K] "m" (W_K)
1449 : SSE_REGs, "memory"
1450 );
1451
1452 return 0;
1453}
1454#endif /* HAVE_INTEL_RORX */
1455#endif /* HAVE_INTEL_AVX1 */
1456
1457
1458#if defined(HAVE_INTEL_AVX2)
1459
1460#define _MOVE_to_REG(ymm, mem, i) \
1461 "leaq %["#mem"], %%r8\n\t" \
1462 "vmovdqu ("#i")*4(%%r8), %%"#ymm"\n\t"
1463#define _MOVE_to_MEM(mem, i, ymm) \
1464 "leaq %["#mem"], %%r8\n\t" \
1465 "vmovdqu %%"#ymm", "#i"*4(%%r8)\n\t"
1466#define _BYTE_SWAP(ymm, map) \
1467 "vpshufb %["#map"], %%"#ymm", %%"#ymm"\n\t"
1468#define _MOVE_128(ymm0, ymm1, ymm2, map) \
1469 "vperm2i128 $"#map", %%"#ymm2", %%"#ymm1", %%"#ymm0"\n\t"
1470#define _MOVE_BYTE(ymm0, ymm1, map) \
1471 "vpshufb %["#map"], %%"#ymm1", %%"#ymm0"\n\t"
1472#define _S_TEMP(dest, src, bits, temp) \
1473 "vpsrld $"#bits", %%"#src", %%"#dest"\n\t" \
1474 "vpslld $32-"#bits", %%"#src", %%"#temp"\n\t" \
1475 "vpor %%"#temp",%%"#dest", %%"#dest"\n\t"
1476#define _AVX2_R(dest, src, bits) \
1477 "vpsrld $"#bits", %%"#src", %%"#dest"\n\t"
1478#define _XOR(dest, src1, src2) \
1479 "vpxor %%"#src1", %%"#src2", %%"#dest"\n\t"
1480#define _OR(dest, src1, src2) \
1481 "vpor %%"#src1", %%"#src2", %%"#dest"\n\t"
1482#define _ADD(dest, src1, src2) \
1483 "vpaddd %%"#src1", %%"#src2", %%"#dest"\n\t"
1484#define _ADD_MEM(dest, src1, mem, i) \
1485 "leaq %["#mem"], %%r8\n\t" \
1486 "vpaddd "#i"*4(%%r8), %%"#src1", %%"#dest"\n\t"
1487#define _BLEND(map, dest, src1, src2) \
1488 "vpblendd $"#map", %%"#src1", %%"#src2", %%"#dest"\n\t"
1489
1490#define _EXTRACT_XMM_0(xmm, mem) \
1491 "vpextrd $0, %%"#xmm", %["#mem"]\n\t"
1492#define _EXTRACT_XMM_1(xmm, mem) \
1493 "vpextrd $1, %%"#xmm", %["#mem"]\n\t"
1494#define _EXTRACT_XMM_2(xmm, mem) \
1495 "vpextrd $2, %%"#xmm", %["#mem"]\n\t"
1496#define _EXTRACT_XMM_3(xmm, mem) \
1497 "vpextrd $3, %%"#xmm", %["#mem"]\n\t"
1498#define _EXTRACT_XMM_4(ymm, xmm, mem) \
1499 "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t" \
1500 "vpextrd $0, %%"#xmm", %["#mem"]\n\t"
1501#define _EXTRACT_XMM_5(xmm, mem) \
1502 "vpextrd $1, %%"#xmm", %["#mem"]\n\t"
1503#define _EXTRACT_XMM_6(xmm, mem) \
1504 "vpextrd $2, %%"#xmm", %["#mem"]\n\t"
1505#define _EXTRACT_XMM_7(xmm, mem) \
1506 "vpextrd $3, %%"#xmm", %["#mem"]\n\t"
1507
1508#define _SWAP_YMM_HL(ymm) \
1509 "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t"
1510#define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm)
1511
1512#define MOVE_to_REG(ymm, mem, i) _MOVE_to_REG(ymm, mem, i)
1513#define MOVE_to_MEM(mem, i, ymm) _MOVE_to_MEM(mem, i, ymm)
1514#define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map)
1515#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
1516#define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
1517#define XOR(dest, src1, src2) _XOR(dest, src1, src2)
1518#define OR(dest, src1, src2) _OR(dest, src1, src2)
1519#define ADD(dest, src1, src2) _ADD(dest, src1, src2)
1520#define ADD_MEM(dest, src1, mem, i) _ADD_MEM(dest, src1, mem, i)
1521#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
1522
1523#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp)
1524#define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP)
1525#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits)
1526
1527#define GAMMA0(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18) \
1528 XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) XOR(dest, G_TEMP, dest)
1529#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18)
1530#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) \
1531 XOR(dest, G_TEMP, dest)
1532
1533#define GAMMA1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19) \
1534 XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) XOR(dest, G_TEMP, dest)
1535#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19)
1536#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) \
1537 XOR(dest, G_TEMP, dest)
1538
1539#define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP1W_2) \
1540 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2)
1541#define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) \
1542 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAP2W_2) BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2)
1543#define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP3W_2) \
1544 BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2)
1545
1546#define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08)\
1547 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAPW_7) BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7)
1548
1549#undef voitle
1550
1551#define W_I_16 ymm8
1552#define W_I_15 ymm9
1553#define W_I_7 ymm10
1554#define W_I_2 ymm11
1555#define W_I ymm12
1556#define G_TEMP ymm13
1557#define S_TEMP ymm14
1558#define YMM_TEMP0 ymm15
1559#define YMM_TEMP0x xmm15
1560#define W_I_TEMP ymm7
1561#define W_K_TEMP ymm15
1562#define W_K_TEMPx xmm15
1563
1564#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
1565 "vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15"\n\t" \
1566 "vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16"\n\t" \
1567 "vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15"\n\t" \
1568 "vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16"\n\t" \
1569 "vpshufd $0x93, %%"#w_i_16", %%"#w_i_16"\n\t"
1570
1571#define MOVE_7_to_15(w_i_15, w_i_7)\
1572 "vmovdqu %%"#w_i_7", %%"#w_i_15"\n\t"
1573
1574#define MOVE_I_to_7(w_i_7, w_i)\
1575 "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7"\n\t" \
1576 "vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7"\n\t" \
1577 "vpshufd $0x39, %%"#w_i_7", %%"#w_i_7"\n\t"
1578
1579#define MOVE_I_to_2(w_i_2, w_i)\
1580 "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2"\n\t" \
1581 "vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2"\n\t"
1582
1583#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
1584 MOVE_15_to_16(w_i_16, w_i_15, w_i_7) \
1585 MOVE_7_to_15(w_i_15, w_i_7) \
1586 MOVE_I_to_7(w_i_7, w_i) \
1587 MOVE_I_to_2(w_i_2, w_i)
1588
1589#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1590 { word32 d[8];\
1591 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs);\
1592 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs);\
1593 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs);\
1594 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs);\
1595 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs);\
1596 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs);\
1597 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs);\
1598 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs);\
1599 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
1600 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs);\
1601 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs);\
1602 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs);\
1603 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs);\
1604 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs);\
1605 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs);\
1606 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs);\
1607 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs);\
1608}
1609
1610
1611#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1612 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1613
1614#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1615 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1616
1617#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1618 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1619
1620
1621 /* Byte swap masks to ensure that the rest of the words are filled with zeros. */
1622 static const unsigned long mBYTE_FLIP_MASK_16[] =
1623 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
1624 static const unsigned long mBYTE_FLIP_MASK_15[] =
1625 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
1626 static const unsigned long mBYTE_FLIP_MASK_7 [] =
1627 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b };
1628 static const unsigned long mBYTE_FLIP_MASK_2 [] =
1629 { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 };
1630
1631 static const unsigned long mMAPtoW_I_7[] =
1632 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 };
1633 static const unsigned long mMAP1toW_I_2[] =
1634 { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 };
1635 static const unsigned long mMAP2toW_I_2[] =
1636 { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 };
1637 static const unsigned long mMAP3toW_I_2[] =
1638 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 };
1639
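/* Illustrative only: the tables above are vpshufb control masks. Each mask
 * byte selects a source byte within its 16-byte lane, and any index with the
 * high bit set (0x80) produces zero, so a single shuffle both byte-swaps the
 * big-endian input words and zero-fills the lanes that W_I_7/W_I_2 do not use
 * yet. A minimal scalar model of that shuffle semantic follows (hypothetical
 * helper, excluded from the build). */
#if 0
static void pshufb_lane_model(unsigned char out[16],
                              const unsigned char src[16],
                              const unsigned char mask[16])
{
    int i;
    for (i = 0; i < 16; i++) {
        /* high bit set -> zero; otherwise the low nibble indexes the lane */
        out[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
    }
}
#endif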
1640static int Transform_AVX2(wc_Sha256* sha256)
1641{
1642#ifdef WOLFSSL_SMALL_STACK
1643 word32* W_K;
1644 W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
1645 if (W_K == NULL)
1646 return MEMORY_E;
1647#else
1648 word32 W_K[64];
1649#endif
1650
1651 __asm__ __volatile__ (
1652
1653 MOVE_to_REG(W_I_16, buf, 0) BYTE_SWAP(W_I_16, FLIP_16)
1654 MOVE_to_REG(W_I_15, buf, 1) BYTE_SWAP(W_I_15, FLIP_15)
1655 MOVE_to_REG(W_I, buf, 8) BYTE_SWAP(W_I, FLIP_16)
1656 MOVE_to_REG(W_I_7, buf, 16-7) BYTE_SWAP(W_I_7, FLIP_7)
1657 MOVE_to_REG(W_I_2, buf, 16-2) BYTE_SWAP(W_I_2, FLIP_2)
1658
1659 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1660
1661 ADD_MEM(W_K_TEMP, W_I_16, K, 0)
1662 MOVE_to_MEM(W_K, 0, W_K_TEMP)
1663
1664 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0)
1665 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1)
1666 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2)
1667 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3)
1668 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4)
1669 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5)
1670 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6)
1671 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7)
1672
1673 ADD_MEM(YMM_TEMP0, W_I, K, 8)
1674 MOVE_to_MEM(W_K, 8, YMM_TEMP0)
1675
1676 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1677 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1678 GAMMA0_1(W_I_TEMP, W_I_15)
1679 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1680 GAMMA0_2(W_I_TEMP, W_I_15)
1681 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1682 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1683 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9)
1684 ADD(W_I, W_I_7, W_I_TEMP)
1685 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9)
1686 GAMMA1_1(YMM_TEMP0, W_I_2)
1687 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9)
1688 GAMMA1_2(YMM_TEMP0, W_I_2)
1689 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10)
1690 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */
1691 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10)
1692 FEEDBACK1_to_W_I_2
1693 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10)
1694 FEEDBACK_to_W_I_7
1695 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11)
1696 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1697 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11)
1698 GAMMA1_1(YMM_TEMP0, W_I_2)
1699 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11)
1700 GAMMA1_2(YMM_TEMP0, W_I_2)
1701 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12)
1702 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */
1703 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12)
1704 FEEDBACK2_to_W_I_2
1705 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12)
1706 GAMMA1_1(YMM_TEMP0, W_I_2)
1707 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13)
1708 GAMMA1_2(YMM_TEMP0, W_I_2)
1709 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13)
1710 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */
1711 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13)
1712 FEEDBACK3_to_W_I_2
1713 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14)
1714 GAMMA1(YMM_TEMP0, W_I_2)
1715 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14)
1716 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14)
1717 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */
1718 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15)
1719
1720 MOVE_to_REG(YMM_TEMP0, K, 16)
1721 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15)
1722 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1723 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15)
1724 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1725 MOVE_to_MEM(W_K, 16, YMM_TEMP0)
1726
1727 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1728 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1729 GAMMA0_1(W_I_TEMP, W_I_15)
1730 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1731 GAMMA0_2(W_I_TEMP, W_I_15)
1732 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1733 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1734 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17)
1735 ADD(W_I, W_I_7, W_I_TEMP)
1736 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17)
1737 GAMMA1_1(YMM_TEMP0, W_I_2)
1738 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17)
1739 GAMMA1_2(YMM_TEMP0, W_I_2)
1740 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18)
1741 ADD(W_I, W_I, YMM_TEMP0)/* now W[24..25] are completed */
1742 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18)
1743 FEEDBACK1_to_W_I_2
1744 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18)
1745 FEEDBACK_to_W_I_7
1746 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19)
1747 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1748 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19)
1749 GAMMA1(YMM_TEMP0, W_I_2)
1750 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19)
1751 GAMMA1_2(YMM_TEMP0, W_I_2)
1752 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20)
1753 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[24..27] are completed */
1754 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20)
1755 FEEDBACK2_to_W_I_2
1756 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20)
1757 GAMMA1_1(YMM_TEMP0, W_I_2)
1758 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21)
1759 GAMMA1_2(YMM_TEMP0, W_I_2)
1760 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21)
1761 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[24..29] are completed */
1762 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21)
1763 FEEDBACK3_to_W_I_2
1764 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22)
1765 GAMMA1_1(YMM_TEMP0, W_I_2)
1766 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22)
1767 GAMMA1_2(YMM_TEMP0, W_I_2)
1768 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22)
1769 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[24..31] are completed */
1770 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23)
1771
1772 MOVE_to_REG(YMM_TEMP0, K, 24)
1773 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23)
1774 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1775 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23)
1776 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1777 MOVE_to_MEM(W_K, 24, YMM_TEMP0)
1778
1779 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1780 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1781 GAMMA0_1(W_I_TEMP, W_I_15)
1782 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1783 GAMMA0_2(W_I_TEMP, W_I_15)
1784 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1785 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1786 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25)
1787 ADD(W_I, W_I_7, W_I_TEMP)
1788 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25)
1789 GAMMA1_1(YMM_TEMP0, W_I_2)
1790 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25)
1791 GAMMA1_2(YMM_TEMP0, W_I_2)
1792 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26)
1793 ADD(W_I, W_I, YMM_TEMP0)/* now W[32..33] are completed */
1794 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26)
1795 FEEDBACK1_to_W_I_2
1796 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26)
1797 FEEDBACK_to_W_I_7
1798 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27)
1799 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1800 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27)
1801 GAMMA1_1(YMM_TEMP0, W_I_2)
1802 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27)
1803 GAMMA1_2(YMM_TEMP0, W_I_2)
1804 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28)
1805 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[32..35] are completed */
1806 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28)
1807 FEEDBACK2_to_W_I_2
1808 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28)
1809 GAMMA1_1(YMM_TEMP0, W_I_2)
1810 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29)
1811 GAMMA1_2(YMM_TEMP0, W_I_2)
1812 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29)
1813 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[32..37] are completed */
1814 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29)
1815 FEEDBACK3_to_W_I_2
1816 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30)
1817 GAMMA1(YMM_TEMP0, W_I_2)
1818 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30)
1819 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30)
1820 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[32..39] are completed */
1821 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31)
1822
1823 MOVE_to_REG(YMM_TEMP0, K, 32)
1824 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31)
1825 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1826 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31)
1827 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1828 MOVE_to_MEM(W_K, 32, YMM_TEMP0)
1829
1830
1831 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1832 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1833 GAMMA0_1(W_I_TEMP, W_I_15)
1834 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1835 GAMMA0_2(W_I_TEMP, W_I_15)
1836 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1837 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1838 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33)
1839 ADD(W_I, W_I_7, W_I_TEMP)
1840 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33)
1841 GAMMA1_1(YMM_TEMP0, W_I_2)
1842 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33)
1843 GAMMA1_2(YMM_TEMP0, W_I_2)
1844 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34)
1845 ADD(W_I, W_I, YMM_TEMP0)/* now W[40..41] are completed */
1846 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34)
1847 FEEDBACK1_to_W_I_2
1848 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34)
1849 FEEDBACK_to_W_I_7
1850 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35)
1851 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1852 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35)
1853 GAMMA1_1(YMM_TEMP0, W_I_2)
1854 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35)
1855 GAMMA1_2(YMM_TEMP0, W_I_2)
1856 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36)
1857 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[40..43] are completed */
1858 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36)
1859 FEEDBACK2_to_W_I_2
1860 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36)
1861 GAMMA1_1(YMM_TEMP0, W_I_2)
1862 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37)
1863 GAMMA1_2(YMM_TEMP0, W_I_2)
1864 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37)
1865 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[40..45] are completed */
1866 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37)
1867 FEEDBACK3_to_W_I_2
1868 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38)
1869 GAMMA1_1(YMM_TEMP0, W_I_2)
1870 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38)
1871 GAMMA1_2(YMM_TEMP0, W_I_2)
1872 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38)
1873 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[40..47] are completed */
1874 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39)
1875
1876 MOVE_to_REG(YMM_TEMP0, K, 40)
1877 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39)
1878 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1879 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39)
1880 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1881 MOVE_to_MEM(W_K, 40, YMM_TEMP0)
1882
1883 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1884 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1885 GAMMA0_1(W_I_TEMP, W_I_15)
1886 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1887 GAMMA0_2(W_I_TEMP, W_I_15)
1888 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1889 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1890 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41)
1891 ADD(W_I, W_I_7, W_I_TEMP)
1892 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41)
1893 GAMMA1_1(YMM_TEMP0, W_I_2)
1894 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41)
1895 GAMMA1_2(YMM_TEMP0, W_I_2)
1896 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42)
1897 ADD(W_I, W_I, YMM_TEMP0)/* now W[48..49] are completed */
1898 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42)
1899 FEEDBACK1_to_W_I_2
1900 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42)
1901 FEEDBACK_to_W_I_7
1902 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43)
1903 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1904 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43)
1905 GAMMA1_1(YMM_TEMP0, W_I_2)
1906 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43)
1907 GAMMA1_2(YMM_TEMP0, W_I_2)
1908 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44)
1909 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[48..51] are completed */
1910 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44)
1911 FEEDBACK2_to_W_I_2
1912 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44)
1913 GAMMA1_1(YMM_TEMP0, W_I_2)
1914 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45)
1915 GAMMA1_2(YMM_TEMP0, W_I_2)
1916 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45)
1917 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[48..53] are completed */
1918 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45)
1919 FEEDBACK3_to_W_I_2
1920 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46)
1921 GAMMA1_1(YMM_TEMP0, W_I_2)
1922 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46)
1923 GAMMA1_2(YMM_TEMP0, W_I_2)
1924 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46)
1925 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[48..55] are completed */
1926 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47)
1927
1928 MOVE_to_REG(YMM_TEMP0, K, 48)
1929 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47)
1930 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1931 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47)
1932 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1933 MOVE_to_MEM(W_K, 48, YMM_TEMP0)
1934
1935 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1936 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1937 GAMMA0_1(W_I_TEMP, W_I_15)
1938 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1939 GAMMA0_2(W_I_TEMP, W_I_15)
1940 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1941 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1942 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1943 ADD(W_I, W_I_7, W_I_TEMP)
1944 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1945 GAMMA1_1(YMM_TEMP0, W_I_2)
1946 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1947 GAMMA1_2(YMM_TEMP0, W_I_2)
1948 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1949 ADD(W_I, W_I, YMM_TEMP0)/* now W[56..57] are completed */
1950 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1951 FEEDBACK1_to_W_I_2
1952 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1953 FEEDBACK_to_W_I_7
1954 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1955 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1956 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1957 GAMMA1_1(YMM_TEMP0, W_I_2)
1958 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1959 GAMMA1_2(YMM_TEMP0, W_I_2)
1960 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1961 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[56..59] are completed */
1962 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1963 FEEDBACK2_to_W_I_2
1964 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1965 GAMMA1_1(YMM_TEMP0, W_I_2)
1966 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1967 GAMMA1_2(YMM_TEMP0, W_I_2)
1968 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1969 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[56..61] are completed */
1970 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1971 FEEDBACK3_to_W_I_2
1972 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1973 GAMMA1_1(YMM_TEMP0, W_I_2)
1974 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1975 GAMMA1_2(YMM_TEMP0, W_I_2)
1976 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1977 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[56..63] are completed */
1978 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1979
1980 MOVE_to_REG(YMM_TEMP0, K, 56)
1981 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1982 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1983 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1984 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1985 MOVE_to_MEM(W_K, 56, YMM_TEMP0)
1986
1987 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56)
1988 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57)
1989 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58)
1990 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59)
1991
1992 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60)
1993 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61)
1994 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62)
1995 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63)
1996
1997 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1998
1999 :
2000 : [FLIP_16] "m" (mBYTE_FLIP_MASK_16[0]),
2001 [FLIP_15] "m" (mBYTE_FLIP_MASK_15[0]),
2002 [FLIP_7] "m" (mBYTE_FLIP_MASK_7[0]),
2003 [FLIP_2] "m" (mBYTE_FLIP_MASK_2[0]),
2004 [MAPW_7] "m" (mMAPtoW_I_7[0]),
2005 [MAP1W_2] "m" (mMAP1toW_I_2[0]),
2006 [MAP2W_2] "m" (mMAP2toW_I_2[0]),
2007 [MAP3W_2] "m" (mMAP3toW_I_2[0]),
2008 [digest] "m" (sha256->digest),
2009 [buf] "m" (sha256->buffer),
2010 [K] "m" (K),
2011 [W_K] "m" (W_K)
2012 : SSE_REGs, "memory"
2013 );
2014
2015#ifdef WOLFSSL_SMALL_STACK
2016 XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
2017#endif
2018
2019 return 0;
2020}
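/* For reference, a scalar sketch of what the interleaved schedule above
 * computes: W[16..63] per FIPS 180-4 and the per-round W[i] + K[i] values
 * that Transform_AVX2 stores into W_K eight entries at a time. The
 * ROTR32/GAMMA*_REF/ExpandWK_ref names are illustrative only and the block
 * is excluded from the build. */
#if 0
#define ROTR32(x, n)  (((x) >> (n)) | ((x) << (32 - (n))))
#define GAMMA0_REF(x) (ROTR32((x), 7)  ^ ROTR32((x), 18) ^ ((x) >> 3))
#define GAMMA1_REF(x) (ROTR32((x), 17) ^ ROTR32((x), 19) ^ ((x) >> 10))

static void ExpandWK_ref(const word32 M[16], word32 W_K_out[64])
{
    word32 W[64];
    int i;

    for (i = 0; i < 16; i++)
        W[i] = M[i];                /* input words, already byte-swapped */
    for (i = 16; i < 64; i++)
        W[i] = GAMMA1_REF(W[i-2]) + W[i-7] + GAMMA0_REF(W[i-15]) + W[i-16];
    for (i = 0; i < 64; i++)
        W_K_out[i] = W[i] + K[i];   /* K[] is the round constant table above */
}
#endif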
2021
2022#endif /* HAVE_INTEL_AVX2 */
2023
2024
2025#ifdef WOLFSSL_SHA224
2026
2027#ifdef STM32_HASH
2028
2029 #define Sha256Update Sha224Update
2030 #define Sha256Final Sha224Final
2031
2032 /*
2033 * STM32F2/F4/F7 hardware SHA224 support through the HASH_* APIs from the
2034 * Standard Peripheral Library or CubeMX (see note in README).
2035 */
2036
2037 /* STM32 register size, bytes */
2038 #ifdef WOLFSSL_STM32_CUBEMX
2039 #define SHA224_REG_SIZE WC_SHA224_BLOCK_SIZE
2040 #else
2041 #define SHA224_REG_SIZE 4
2042 /* STM32 struct notes:
2043 * sha224->buffer = first 4 bytes used to hold partial block if needed
2044 * sha224->buffLen = num bytes currently stored in sha224->buffer
2045 * sha224->loLen = num bytes that have been written to STM32 FIFO
2046 */
2047 #endif
2048 #define SHA224_HW_TIMEOUT 0xFF
2049
2050 static int InitSha224(wc_Sha224* sha224)
2051 {
2052 if (sha224 == NULL)
2053 return BAD_FUNC_ARG;
2054
2055 XMEMSET(sha224->buffer, 0, sizeof(sha224->buffer));
2056 sha224->buffLen = 0;
2057 sha224->loLen = 0;
2058 sha224->hiLen = 0;
2059
2060 /* initialize HASH peripheral */
2061 #ifdef WOLFSSL_STM32_CUBEMX
2062 HAL_HASH_DeInit(&sha224->hashHandle);
2063 sha224->hashHandle.Init.DataType = HASH_DATATYPE_8B;
2064 if (HAL_HASH_Init(&sha224->hashHandle) != HAL_OK) {
2065 return ASYNC_INIT_E;
2066 }
2067 /* required because Cube MX is not clearing algo bits */
2068 HASH->CR &= ~HASH_CR_ALGO;
2069 #else
2070 HASH_DeInit();
2071
2072 /* reset the hash control register */
2073 /* required because Cube MX is not clearing algo bits */
2074 HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE);
2075
2076 /* configure algo used, algo mode, datatype */
2077 HASH->CR |= (HASH_AlgoSelection_SHA224 | HASH_AlgoMode_HASH
2078 | HASH_DataType_8b);
2079
2080 /* reset HASH processor */
2081 HASH->CR |= HASH_CR_INIT;
2082 #endif
2083
2084 return 0;
2085 }
2086
2087 static int Sha224Update(wc_Sha256* sha224, const byte* data, word32 len)
2088 {
2089 int ret = 0;
2090 byte* local;
2091
2092 /* do block size increments */
2093 local = (byte*)sha224->buffer;
2094
2095 /* check that internal buffLen is valid */
2096 if (sha224->buffLen >= SHA224_REG_SIZE)
2097 return BUFFER_E;
2098
2099 while (len) {
2100 word32 add = min(len, SHA224_REG_SIZE - sha224->buffLen);
2101 XMEMCPY(&local[sha224->buffLen], data, add);
2102
2103 sha224->buffLen += add;
2104 data += add;
2105 len -= add;
2106
2107 if (sha224->buffLen == SHA224_REG_SIZE) {
2108 #ifdef WOLFSSL_STM32_CUBEMX
2109 if (HAL_HASHEx_SHA224_Accumulate(
2110 &sha224->hashHandle, local, SHA224_REG_SIZE) != HAL_OK) {
2111 ret = ASYNC_OP_E;
2112 }
2113 #else
2114 HASH_DataIn(*(uint32_t*)local);
2115 #endif
2116
2117 AddLength(sha224, SHA224_REG_SIZE);
2118 sha224->buffLen = 0;
2119 }
2120 }
2121 return ret;
2122 }
2123
2124 static int Sha224Final(wc_Sha256* sha224)
2125 {
2126 int ret = 0;
2127
2128 #ifdef WOLFSSL_STM32_CUBEMX
2129 if (HAL_HASHEx_SHA224_Start(&sha224->hashHandle,
2130 (byte*)sha224->buffer, sha224->buffLen,
2131 (byte*)sha224->digest, SHA224_HW_TIMEOUT) != HAL_OK) {
2132 ret = ASYNC_OP_E;
2133 }
2134 #else
2135 __IO uint16_t nbvalidbitsdata = 0;
2136
2137 /* feed any remaining buffered bytes into the FIFO */
2138 if (sha224->buffLen > 0) {
2139 HASH_DataIn(*(uint32_t*)sha224->buffer);
2140 AddLength(sha224, sha224->buffLen);
2141 }
2142
2143 /* calculate number of valid bits in last word of input data */
2144 nbvalidbitsdata = 8 * (sha224->loLen % SHA224_REG_SIZE);
2145
2146 /* configure number of valid bits in last word of the data */
2147 HASH_SetLastWordValidBitsNbr(nbvalidbitsdata);
2148
2149 /* start HASH processor */
2150 HASH_StartDigest();
2151
2152 /* wait until Busy flag == RESET */
2153 while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {}
2154
2155 /* read message digest */
2156 sha224->digest[0] = HASH->HR[0];
2157 sha224->digest[1] = HASH->HR[1];
2158 sha224->digest[2] = HASH->HR[2];
2159 sha224->digest[3] = HASH->HR[3];
2160 sha224->digest[4] = HASH->HR[4];
2161 sha224->digest[5] = HASH_DIGEST->HR[5];
2162 sha224->digest[6] = HASH_DIGEST->HR[6];
2163
2164 ByteReverseWords(sha224->digest, sha224->digest, SHA224_DIGEST_SIZE);
2165 #endif /* WOLFSSL_STM32_CUBEMX */
2166
2167 return ret;
2168 }
2169
2170#else
2171
2172 static int InitSha224(wc_Sha224* sha224)
2173 {
2174 int ret = 0;
2175
2176 if (sha224 == NULL) {
2177 return BAD_FUNC_ARG;
2178 }
2179
2180 sha224->digest[0] = 0xc1059ed8;
2181 sha224->digest[1] = 0x367cd507;
2182 sha224->digest[2] = 0x3070dd17;
2183 sha224->digest[3] = 0xf70e5939;
2184 sha224->digest[4] = 0xffc00b31;
2185 sha224->digest[5] = 0x68581511;
2186 sha224->digest[6] = 0x64f98fa7;
2187 sha224->digest[7] = 0xbefa4fa4;
2188
2189 sha224->buffLen = 0;
2190 sha224->loLen = 0;
2191 sha224->hiLen = 0;
2192
2193 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
2194 /* choose best Transform function under this runtime environment */
2195 Sha256_SetTransform();
2196 #endif
2197
2198 return ret;
2199 }
2200
2201#endif /* STM32_HASH */
2202
2203 int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
2204 {
2205 int ret = 0;
2206
2207 if (sha224 == NULL)
2208 return BAD_FUNC_ARG;
2209
2210 sha224->heap = heap;
2211
2212 ret = InitSha224(sha224);
2213 if (ret != 0)
2214 return ret;
2215
2216 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2217 ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
2218 WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
2219 #else
2220 (void)devId;
2221 #endif /* WOLFSSL_ASYNC_CRYPT */
2222
2223 return ret;
2224 }
2225
2226 int wc_InitSha224(wc_Sha224* sha224)
2227 {
2228 return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
2229 }
2230
2231 int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
2232 {
2233 int ret;
2234
2235 if (sha224 == NULL || (data == NULL && len > 0)) {
2236 return BAD_FUNC_ARG;
2237 }
2238
2239 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2240 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
2241 #if defined(HAVE_INTEL_QA)
2242 return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
2243 #endif
2244 }
2245 #endif /* WOLFSSL_ASYNC_CRYPT */
2246
2247 ret = Sha256Update((wc_Sha256*)sha224, data, len);
2248
2249 return ret;
2250 }
2251
2252 int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
2253 {
2254 int ret;
2255
2256 if (sha224 == NULL || hash == NULL) {
2257 return BAD_FUNC_ARG;
2258 }
2259
2260 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2261 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
2262 #if defined(HAVE_INTEL_QA)
2263 return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
2264 WC_SHA224_DIGEST_SIZE);
2265 #endif
2266 }
2267 #endif /* WOLFSSL_ASYNC_CRYPT */
2268
2269 ret = Sha256Final((wc_Sha256*)sha224);
2270 if (ret != 0)
2271 return ret;
2272
2273 #if defined(LITTLE_ENDIAN_ORDER) && !defined(STM32_HASH)
2274 ByteReverseWords(sha224->digest, sha224->digest, WC_SHA224_DIGEST_SIZE);
2275 #endif
2276 XMEMCPY(hash, sha224->digest, WC_SHA224_DIGEST_SIZE);
2277
2278 return InitSha224(sha224); /* reset state */
2279 }
2280
2281 void wc_Sha224Free(wc_Sha224* sha224)
2282 {
2283 if (sha224 == NULL)
2284 return;
2285
2286 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2287 wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
2288 #endif /* WOLFSSL_ASYNC_CRYPT */
2289 }
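 /* Illustrative only (excluded from the build): typical one-shot use of the
  * SHA-224 wrappers defined above. HashWithSha224 and its arguments are
  * hypothetical names, not part of the wolfCrypt API. */
 #if 0
 static int HashWithSha224(const byte* msg, word32 msgSz,
                           byte digest[WC_SHA224_DIGEST_SIZE])
 {
     wc_Sha224 sha224;
     int ret;

     ret = wc_InitSha224(&sha224);
     if (ret == 0)
         ret = wc_Sha224Update(&sha224, msg, msgSz);
     if (ret == 0)
         ret = wc_Sha224Final(&sha224, digest); /* also resets the state */
     wc_Sha224Free(&sha224);
     return ret;
 }
 #endif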
2290
2291#endif /* WOLFSSL_SHA224 */
2292
2293
2294int wc_InitSha256(wc_Sha256* sha256)
2295{
2296 return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
2297}
2298
2299void wc_Sha256Free(wc_Sha256* sha256)
2300{
2301 if (sha256 == NULL)
2302 return;
2303
2304#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
2305 wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
2306#endif /* WOLFSSL_ASYNC_CRYPT */
2307}
2308
2309#endif /* !WOLFSSL_TI_HASH */
2310#endif /* HAVE_FIPS */
2311
2312
2313#ifndef WOLFSSL_TI_HASH
2314#ifdef WOLFSSL_SHA224
2315 int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash)
2316 {
2317 int ret;
2318 wc_Sha224 tmpSha224;
2319
2320 if (sha224 == NULL || hash == NULL)
2321 return BAD_FUNC_ARG;
2322
2323 ret = wc_Sha224Copy(sha224, &tmpSha224);
2324 if (ret == 0) {
2325 ret = wc_Sha224Final(&tmpSha224, hash);
2326 }
2327 return ret;
2328 }
2329 int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst)
2330 {
2331 int ret = 0;
2332
2333 if (src == NULL || dst == NULL)
2334 return BAD_FUNC_ARG;
2335
2336 XMEMCPY(dst, src, sizeof(wc_Sha224));
2337
2338 #ifdef WOLFSSL_ASYNC_CRYPT
2339 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2340 #endif
2341
2342 return ret;
2343 }
2344#endif /* WOLFSSL_SHA224 */
2345
2346int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash)
2347{
2348 int ret;
2349 wc_Sha256 tmpSha256;
2350
2351 if (sha256 == NULL || hash == NULL)
2352 return BAD_FUNC_ARG;
2353
2354 ret = wc_Sha256Copy(sha256, &tmpSha256);
2355 if (ret == 0) {
2356 ret = wc_Sha256Final(&tmpSha256, hash);
2357 }
2358 return ret;
2359}
2360int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
2361{
2362 int ret = 0;
2363
2364 if (src == NULL || dst == NULL)
2365 return BAD_FUNC_ARG;
2366
2367 XMEMCPY(dst, src, sizeof(wc_Sha256));
2368
2369#ifdef WOLFSSL_ASYNC_CRYPT
2370 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2371#endif
2372#ifdef WOLFSSL_PIC32MZ_HASH
2373 ret = wc_Pic32HashCopy(&src->cache, &dst->cache);
2374#endif
2375
2376 return ret;
2377}
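/* Illustrative only (excluded from the build): wc_Sha256GetHash finalizes a
 * copy of the context, so an intermediate digest can be read without
 * disturbing the running hash. IntermediateThenFinal is a hypothetical name,
 * not part of the wolfCrypt API. */
#if 0
static int IntermediateThenFinal(wc_Sha256* sha256,
                                 const byte* more, word32 moreSz,
                                 byte mid[WC_SHA256_DIGEST_SIZE],
                                 byte last[WC_SHA256_DIGEST_SIZE])
{
    int ret = wc_Sha256GetHash(sha256, mid);   /* snapshot via wc_Sha256Copy */
    if (ret == 0)
        ret = wc_Sha256Update(sha256, more, moreSz);
    if (ret == 0)
        ret = wc_Sha256Final(sha256, last);
    return ret;
}
#endif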
2378#endif /* !WOLFSSL_TI_HASH */
2379
2380#endif /* NO_SHA256 */