/* sha512.c
 *
 * Copyright (C) 2006-2017 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */


#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)

#if defined(HAVE_FIPS) && \
    defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2)

    /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
    #define FIPS_NO_WRAPPERS

    #ifdef USE_WINDOWS_API
        #pragma code_seg(".fipsA$k")
        #pragma const_seg(".fipsB$k")
    #endif
#endif

#include <wolfssl/wolfcrypt/sha512.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>

/* deprecated USE_SLOW_SHA2 (replaced with USE_SLOW_SHA512) */
#if defined(USE_SLOW_SHA2) && !defined(USE_SLOW_SHA512)
    #define USE_SLOW_SHA512
#endif

/* fips wrapper calls, user can call direct */
#if defined(HAVE_FIPS) && \
    (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))

#ifdef WOLFSSL_SHA512

int wc_InitSha512(wc_Sha512* sha)
{
    if (sha == NULL) {
        return BAD_FUNC_ARG;
    }

    return InitSha512_fips(sha);
}
int wc_InitSha512_ex(wc_Sha512* sha, void* heap, int devId)
{
    (void)heap;
    (void)devId;
    if (sha == NULL) {
        return BAD_FUNC_ARG;
    }
    return InitSha512_fips(sha);
}
int wc_Sha512Update(wc_Sha512* sha, const byte* data, word32 len)
{
    if (sha == NULL || (data == NULL && len > 0)) {
        return BAD_FUNC_ARG;
    }

    return Sha512Update_fips(sha, data, len);
}
int wc_Sha512Final(wc_Sha512* sha, byte* out)
{
    if (sha == NULL || out == NULL) {
        return BAD_FUNC_ARG;
    }

    return Sha512Final_fips(sha, out);
}
void wc_Sha512Free(wc_Sha512* sha)
{
    (void)sha;
    /* Not supported in FIPS */
}
#endif

#if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM)
int wc_InitSha384(wc_Sha384* sha)
{
    if (sha == NULL) {
        return BAD_FUNC_ARG;
    }
    return InitSha384_fips(sha);
}
int wc_InitSha384_ex(wc_Sha384* sha, void* heap, int devId)
{
    (void)heap;
    (void)devId;
    if (sha == NULL) {
        return BAD_FUNC_ARG;
    }
    return InitSha384_fips(sha);
}
int wc_Sha384Update(wc_Sha384* sha, const byte* data, word32 len)
{
    if (sha == NULL || (data == NULL && len > 0)) {
        return BAD_FUNC_ARG;
    }
    return Sha384Update_fips(sha, data, len);
}
int wc_Sha384Final(wc_Sha384* sha, byte* out)
{
    if (sha == NULL || out == NULL) {
        return BAD_FUNC_ARG;
    }
    return Sha384Final_fips(sha, out);
}
void wc_Sha384Free(wc_Sha384* sha)
{
    (void)sha;
    /* Not supported in FIPS */
}
#endif /* WOLFSSL_SHA384 || HAVE_AESGCM */

#else /* else build without fips, or for FIPS v2 */

#include <wolfssl/wolfcrypt/logging.h>

#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif


#if defined(USE_INTEL_SPEEDUP)
    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
        #undef  NO_AVX2_SUPPORT
        #define NO_AVX2_SUPPORT
    #endif
    #if defined(__clang__) && ((__clang_major__ < 3) || \
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
        #define NO_AVX2_SUPPORT
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
        #undef NO_AVX2_SUPPORT
    #endif

    #define HAVE_INTEL_AVX1
    #ifndef NO_AVX2_SUPPORT
        #define HAVE_INTEL_AVX2
    #endif
#endif

#if defined(HAVE_INTEL_AVX1)
    /* #define DEBUG_XMM */
#endif

#if defined(HAVE_INTEL_AVX2)
    #define HAVE_INTEL_RORX
    /* #define DEBUG_YMM */
#endif

#if defined(HAVE_BYTEREVERSE64) && \
    !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size)
    #define ByteReverseWords64_1(buf, size) \
        { unsigned int i ;\
            for(i=0; i< size/sizeof(word64); i++){\
                __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\
            }\
        }
#endif

#if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
    /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
#else

#ifdef WOLFSSL_SHA512

static int InitSha512(wc_Sha512* sha512)
{
    if (sha512 == NULL)
        return BAD_FUNC_ARG;

    sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
    sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
    sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
    sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
    sha512->digest[4] = W64LIT(0x510e527fade682d1);
    sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
    sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
    sha512->digest[7] = W64LIT(0x5be0cd19137e2179);

    sha512->buffLen = 0;
    sha512->loLen   = 0;
    sha512->hiLen   = 0;

    return 0;
}

#endif /* WOLFSSL_SHA512 */

/* Hardware Acceleration */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

#ifdef WOLFSSL_SHA512

/*****
Intel AVX1/AVX2 Macro Control Structure

#if defined(HAVE_INTEL_SPEEDUP)
    #define HAVE_INTEL_AVX1
    #define HAVE_INTEL_AVX2
#endif

int InitSha512(wc_Sha512* sha512) {
    Save/Recover XMM, YMM
    ...

    Check Intel AVX cpuid flags
}

#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
    Transform_Sha512_AVX1(); # Function prototype
    Transform_Sha512_AVX2(); #
#endif

_Transform_Sha512() { # Native Transform Function body

}

int Sha512Update() {
    Save/Recover XMM, YMM
    ...
}

int Sha512Final() {
    Save/Recover XMM, YMM
    ...
}


#if defined(HAVE_INTEL_AVX1)

    XMM Instructions/INLINE asm Definitions

#endif

#if defined(HAVE_INTEL_AVX2)

    YMM Instructions/INLINE asm Definitions

#endif

#if defined(HAVE_INTEL_AVX1)

    int Transform_Sha512_AVX1() {
        Stitched Message Sched/Round
    }

#endif

#if defined(HAVE_INTEL_AVX2)

    int Transform_Sha512_AVX2() {
        Stitched Message Sched/Round
    }
#endif

*/


/* Each platform needs to query info type 1 from cpuid to see if AVX1/AVX2 is
 * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
 */

#if defined(HAVE_INTEL_AVX1)
    static int Transform_Sha512_AVX1(wc_Sha512 *sha512);
    static int Transform_Sha512_AVX1_Len(wc_Sha512 *sha512, word32 len);
#endif
#if defined(HAVE_INTEL_AVX2)
    static int Transform_Sha512_AVX2(wc_Sha512 *sha512);
    static int Transform_Sha512_AVX2_Len(wc_Sha512 *sha512, word32 len);
    #if defined(HAVE_INTEL_RORX)
        static int Transform_Sha512_AVX1_RORX(wc_Sha512 *sha512);
        static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512 *sha512,
                                                  word32 len);
        static int Transform_Sha512_AVX2_RORX(wc_Sha512 *sha512);
        static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512 *sha512,
                                                  word32 len);
    #endif
#endif
static int _Transform_Sha512(wc_Sha512 *sha512);
static int (*Transform_Sha512_p)(wc_Sha512* sha512) = _Transform_Sha512;
static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL;
static int transform_check = 0;
static int intel_flags;
#define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512)
#define Transform_Sha512_Len(sha512, len) \
    (*Transform_Sha512_Len_p)(sha512, len)

static void Sha512_SetTransform()
{
    if (transform_check)
        return;

    intel_flags = cpuid_get_flags();

#if defined(HAVE_INTEL_AVX2)
    if (IS_INTEL_AVX2(intel_flags)) {
    #ifdef HAVE_INTEL_RORX
        if (IS_INTEL_BMI2(intel_flags)) {
            Transform_Sha512_p = Transform_Sha512_AVX2_RORX;
            Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len;
        }
        else
    #endif
        if (1) {
            Transform_Sha512_p = Transform_Sha512_AVX2;
            Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len;
        }
    #ifdef HAVE_INTEL_RORX
        else {
            Transform_Sha512_p = Transform_Sha512_AVX1_RORX;
            Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len;
        }
    #endif
    }
    else
#endif
#if defined(HAVE_INTEL_AVX1)
    if (IS_INTEL_AVX1(intel_flags)) {
        Transform_Sha512_p = Transform_Sha512_AVX1;
        Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len;
    }
    else
#endif
        Transform_Sha512_p = _Transform_Sha512;

    transform_check = 1;
}
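
/* Note added for readers (not in the upstream file): Sha512_SetTransform()
 * probes the CPUID feature flags once and then prefers AVX2+BMI2 (RORX),
 * then plain AVX2, then AVX1, and finally the portable C transform.
 * A minimal sketch of the same selection, assuming a hypothetical
 * cpu_has() helper, would look like:
 *
 *     typedef int (*xform_fn)(wc_Sha512*);
 *     xform_fn pick_transform(void)
 *     {
 *         if (cpu_has(CPU_AVX2) && cpu_has(CPU_BMI2))
 *             return Transform_Sha512_AVX2_RORX;
 *         if (cpu_has(CPU_AVX2))
 *             return Transform_Sha512_AVX2;
 *         if (cpu_has(CPU_AVX1))
 *             return Transform_Sha512_AVX1;
 *         return _Transform_Sha512;
 *     }
 *
 * The result is cached via transform_check, so the CPUID probe effectively
 * runs only on the first wc_InitSha512*() call.
 */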
#endif /* WOLFSSL_SHA512 */

#else
    #define Transform_Sha512(sha512) _Transform_Sha512(sha512)

#endif

#ifdef WOLFSSL_SHA512

int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
{
    int ret = 0;

    if (sha512 == NULL)
        return BAD_FUNC_ARG;

    sha512->heap = heap;

    ret = InitSha512(sha512);
    if (ret != 0)
        return ret;

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    Sha512_SetTransform();
#endif

#ifdef WOLFSSL_SMALL_STACK_CACHE
    sha512->W = NULL;
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    ret = wolfAsync_DevCtxInit(&sha512->asyncDev,
                        WOLFSSL_ASYNC_MARKER_SHA512, sha512->heap, devId);
#else
    (void)devId;
#endif /* WOLFSSL_ASYNC_CRYPT */

    return ret;
}

#endif /* WOLFSSL_SHA512 */


static const word64 K512[80] = {
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};

#define blk0(i) (W[i] = sha512->buffer[i])

#define blk2(i) (\
               W[ i     & 15] += \
            s1(W[(i-2)  & 15])+ \
               W[(i-7)  & 15] + \
            s0(W[(i-15) & 15])  \
        )

#define Ch(x,y,z)  (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))

#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]

#define S0(x) (rotrFixed64(x,28)^rotrFixed64(x,34)^rotrFixed64(x,39))
#define S1(x) (rotrFixed64(x,14)^rotrFixed64(x,18)^rotrFixed64(x,41))
#define s0(x) (rotrFixed64(x,1)^rotrFixed64(x,8)^(x>>7))
#define s1(x) (rotrFixed64(x,19)^rotrFixed64(x,61)^(x>>6))

#define R(i) \
    h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j ? blk2(i) : blk0(i)); \
    d(i) += h(i); \
    h(i) += S0(a(i)) + Maj(a(i),b(i),c(i))

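/* Reader's note (added comment, not in the upstream file): the a(i)..h(i)
 * macros index one array T[8] so the eight working variables "rotate"
 * instead of being copied every round. For example, with i = 1, a(1)
 * expands to T[(0-1)&7] == T[7] and h(1) to T[(7-1)&7] == T[6], which is
 * exactly the state obtained by shifting a..h down one position. R(i) then
 * performs one SHA-512 round: it adds Sigma1(e), Ch(e,f,g), the round
 * constant K[i+j] and the message word to h, folds h into d, and finally
 * adds Sigma0(a) + Maj(a,b,c) to h.
 */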
static int _Transform_Sha512(wc_Sha512* sha512)
{
    const word64* K = K512;
    word32 j;
    word64 T[8];

#ifdef WOLFSSL_SMALL_STACK_CACHE
    word64* W = sha512->W;
    if (W == NULL) {
        W = (word64*)XMALLOC(sizeof(word64) * 16, NULL,
                                                   DYNAMIC_TYPE_TMP_BUFFER);
        if (W == NULL)
            return MEMORY_E;
        sha512->W = W;
    }
#elif defined(WOLFSSL_SMALL_STACK)
    word64* W;
    W = (word64*)XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W == NULL)
        return MEMORY_E;
#else
    word64 W[16];
#endif

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

#ifdef USE_SLOW_SHA512
    /* over twice as small, but 50% slower */
    /* 80 operations, not unrolled */
    for (j = 0; j < 80; j += 16) {
        int m;
        for (m = 0; m < 16; m++) { /* braces needed here for macros {} */
            R(m);
        }
    }
#else
    /* 80 operations, partially loop unrolled */
    for (j = 0; j < 80; j += 16) {
        R( 0); R( 1); R( 2); R( 3);
        R( 4); R( 5); R( 6); R( 7);
        R( 8); R( 9); R(10); R(11);
        R(12); R(13); R(14); R(15);
    }
#endif /* USE_SLOW_SHA512 */

    /* Add the working vars back into digest */
    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
    ForceZero(W, sizeof(word64) * 16);
    ForceZero(T, sizeof(T));

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
    XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}


static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len)
{
    word64 tmp = sha512->loLen;
    if ( (sha512->loLen += len) < tmp)
        sha512->hiLen++;                       /* carry low to high */
}

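/* Added note on AddLength() above: loLen/hiLen together hold the 128-bit
 * byte count of the message. The overflow test works because unsigned
 * wrap-around is well defined: e.g. with loLen = 0xFFFFFFFFFFFFFFF0 and
 * len = 0x20, loLen becomes 0x10, which is smaller than its previous
 * value, so hiLen is incremented to record the carry.
 */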
static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
{
    int ret = 0;
    /* do block size increments */
    byte* local = (byte*)sha512->buffer;

    /* check that internal buffLen is valid */
    if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE)
        return BUFFER_E;

    if (sha512->buffLen > 0) {
        word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
        if (add > 0) {
            XMEMCPY(&local[sha512->buffLen], data, add);

            sha512->buffLen += add;
            data            += add;
            len             -= add;
        }

        if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) {
    #if defined(LITTLE_ENDIAN_ORDER)
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
        #endif
            {
                ByteReverseWords64(sha512->buffer, sha512->buffer,
                                   WC_SHA512_BLOCK_SIZE);
            }
    #endif
            ret = Transform_Sha512(sha512);
            if (ret == 0) {
                AddLength(sha512, WC_SHA512_BLOCK_SIZE);
                sha512->buffLen = 0;
            }
            else
                len = 0;
        }
    }

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    if (Transform_Sha512_Len_p != NULL) {
        word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);

        if (blocksLen > 0) {
            AddLength(sha512, blocksLen);
            sha512->data = data;
            /* Byte reversal performed in function if required. */
            Transform_Sha512_Len(sha512, blocksLen);
            data += blocksLen;
            len  -= blocksLen;
        }
    }
    else
#endif
#if !defined(LITTLE_ENDIAN_ORDER) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    {
        word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);

        AddLength(sha512, blocksLen);
        while (len >= WC_SHA512_BLOCK_SIZE) {
            XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);

            data += WC_SHA512_BLOCK_SIZE;
            len  -= WC_SHA512_BLOCK_SIZE;

            /* Byte reversal performed in function if required. */
            ret = Transform_Sha512(sha512);
            if (ret != 0)
                break;
        }
    }
#else
    {
        word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);

        AddLength(sha512, blocksLen);
        while (len >= WC_SHA512_BLOCK_SIZE) {
            XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);

            data += WC_SHA512_BLOCK_SIZE;
            len  -= WC_SHA512_BLOCK_SIZE;

            ByteReverseWords64(sha512->buffer, sha512->buffer,
                               WC_SHA512_BLOCK_SIZE);
            ret = Transform_Sha512(sha512);
            if (ret != 0)
                break;
        }
    }
#endif

    if (len > 0) {
        XMEMCPY(local, data, len);
        sha512->buffLen = len;
    }

    return ret;
}

#ifdef WOLFSSL_SHA512

int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
{
    if (sha512 == NULL || (data == NULL && len > 0)) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha512(&sha512->asyncDev, NULL, data, len);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    return Sha512Update(sha512, data, len);
}

#endif /* WOLFSSL_SHA512 */

#endif /* WOLFSSL_IMX6_CAAM */

static WC_INLINE int Sha512Final(wc_Sha512* sha512)
{
    byte* local = (byte*)sha512->buffer;
    int ret;

    if (sha512 == NULL) {
        return BAD_FUNC_ARG;
    }

    AddLength(sha512, sha512->buffLen);  /* before adding pads */

    local[sha512->buffLen++] = 0x80;     /* add 1 */

    /* pad with zeros */
    if (sha512->buffLen > WC_SHA512_PAD_SIZE) {
        XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
        sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen;
#if defined(LITTLE_ENDIAN_ORDER)
    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
    #endif
        {
            ByteReverseWords64(sha512->buffer, sha512->buffer,
                               WC_SHA512_BLOCK_SIZE);
        }
#endif /* LITTLE_ENDIAN_ORDER */
        ret = Transform_Sha512(sha512);
        if (ret != 0)
            return ret;

        sha512->buffLen = 0;
    }
    XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen);

    /* put lengths in bits */
    sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) +
                                                         (sha512->hiLen << 3);
    sha512->loLen = sha512->loLen << 3;

    /* store lengths */
#if defined(LITTLE_ENDIAN_ORDER)
    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
    #endif
            ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_PAD_SIZE);
#endif
    /* ! length ordering dependent on digest endian type ! */

    sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
    sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
        ByteReverseWords64(&(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE);
#endif
    ret = Transform_Sha512(sha512);
    if (ret != 0)
        return ret;

#ifdef LITTLE_ENDIAN_ORDER
    ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE);
#endif

    return 0;
}

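/* Added worked example for the padding logic in Sha512Final() above:
 * WC_SHA512_BLOCK_SIZE is 128 and WC_SHA512_PAD_SIZE is 112. With 115
 * bytes buffered, appending the 0x80 byte makes buffLen 116 > 112, so the
 * rest of that block is zero-filled and hashed, and the 16-byte (128-bit)
 * bit length goes into a second, otherwise-zero block. With 10 bytes
 * buffered, 0x80 plus zeros up to offset 112 and the length in the last
 * 16 bytes all fit in a single final block.
 */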
#ifdef WOLFSSL_SHA512

int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash)
{
#ifdef LITTLE_ENDIAN_ORDER
    word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)];
#endif

    if (sha512 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#ifdef LITTLE_ENDIAN_ORDER
    ByteReverseWords64((word64*)digest, (word64*)sha512->digest,
                                                       WC_SHA512_DIGEST_SIZE);
    XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE);
#else
    XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
#endif

    return 0;
}

int wc_Sha512Final(wc_Sha512* sha512, byte* hash)
{
    int ret;

    if (sha512 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha512(&sha512->asyncDev, hash, NULL,
                                WC_SHA512_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha512Final(sha512);
    if (ret != 0)
        return ret;

    XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);

    return InitSha512(sha512); /* reset state */
}
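
/* Added usage sketch (illustrative only, not part of the upstream file):
 * the one-shot pattern a caller would typically use with this API.
 *
 *     byte digest[WC_SHA512_DIGEST_SIZE];
 *     wc_Sha512 sha;
 *
 *     if (wc_InitSha512(&sha) == 0) {
 *         wc_Sha512Update(&sha, (const byte*)"abc", 3);
 *         wc_Sha512Final(&sha, digest);    (digest now holds SHA-512("abc"))
 *         wc_Sha512Free(&sha);
 *     }
 *
 * wc_Sha512Final() re-initializes the hash state, so the same context can
 * be reused for another message without calling wc_InitSha512() again.
 */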


int wc_InitSha512(wc_Sha512* sha512)
{
    return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID);
}

void wc_Sha512Free(wc_Sha512* sha512)
{
    if (sha512 == NULL)
        return;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    if (sha512->W != NULL) {
        XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        sha512->W = NULL;
    }
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    wolfAsync_DevCtxFree(&sha512->asyncDev, WOLFSSL_ASYNC_MARKER_SHA512);
#endif /* WOLFSSL_ASYNC_CRYPT */
}


#if defined(HAVE_INTEL_AVX1)

static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f };
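/* Added note: this mask drives vpshufb in LOAD_W below. Each byte of the
 * mask selects a source byte, so { 0x0001020304050607, 0x08090a0b0c0d0e0f }
 * reverses the byte order within each 64-bit lane, converting the
 * big-endian message words of SHA-512 into the little-endian form used by
 * the registers.
 */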

#define W_0     xmm0
#define W_2     xmm1
#define W_4     xmm2
#define W_6     xmm3
#define W_8     xmm4
#define W_10    xmm5
#define W_12    xmm6
#define W_14    xmm7

#define W_M15   xmm12
#define W_M7    xmm13
#define MASK    xmm14

#define XTMP1   xmm8
#define XTMP2   xmm9
#define XTMP3   xmm10
#define XTMP4   xmm11

#define XMM_REGS \
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"

#define _VPALIGNR(dest, src1, src2, bits) \
    "vpalignr $" #bits ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
#define VPALIGNR(dest, src1, src2, bits) \
    _VPALIGNR(dest, src1, src2, bits)

#define _V_SHIFT_R(dest, src, bits) \
    "vpsrlq $" #bits ", %%" #src ", %%" #dest "\n\t"
#define V_SHIFT_R(dest, src, bits) \
    _V_SHIFT_R(dest, src, bits)

#define _V_SHIFT_L(dest, src, bits) \
    "vpsllq $" #bits ", %%" #src ", %%" #dest "\n\t"
#define V_SHIFT_L(dest, src, bits) \
    _V_SHIFT_L(dest, src, bits)

#define _V_ADD(dest, src1, src2) \
    "vpaddq %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
#define V_ADD(dest, src1, src2) \
    _V_ADD(dest, src1, src2)

#define _V_XOR(dest, src1, src2) \
    "vpxor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
#define V_XOR(dest, src1, src2) \
    _V_XOR(dest, src1, src2)

#define _V_OR(dest, src1, src2) \
    "vpor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
#define V_OR(dest, src1, src2) \
    _V_OR(dest, src1, src2)

#define RA  %%r8
#define RB  %%r9
#define RC  %%r10
#define RD  %%r11
#define RE  %%r12
#define RF  %%r13
#define RG  %%r14
#define RH  %%r15

#define STATE_REGS "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"

#define L1  "%%rax"
#define L2  "%%rcx"
#define L3  "%%rdx"
#define L4  "%%rbx"
#define WX  "%%rsp"

#define WORK_REGS "rax", "rbx", "rcx", "rdx"

#define RND_0_1(a,b,c,d,e,f,g,h,i) \
    /* L1 = e >>> 23 */ \
    "rorq $23, " L1 "\n\t" \

#define RND_0_2(a,b,c,d,e,f,g,h,i) \
    /* L3 = a */ \
    "movq "#a", " L3 "\n\t" \
    /* L2 = f */ \
    "movq "#f", " L2 "\n\t" \
    /* h += W_X[i] */ \
    "addq ("#i")*8(" WX "), "#h"\n\t" \
    /* L2 = f ^ g */ \
    "xorq "#g", " L2 "\n\t" \

#define RND_0_2_A(a,b,c,d,e,f,g,h,i) \
    /* L3 = a */ \
    "movq "#a", " L3 "\n\t" \
    /* L2 = f */ \
    "movq "#f", " L2 "\n\t" \

#define RND_0_2_B(a,b,c,d,e,f,g,h,i) \
    /* h += W_X[i] */ \
    "addq ("#i")*8(" WX "), "#h"\n\t" \
    /* L2 = f ^ g */ \
    "xorq "#g", " L2 "\n\t" \

#define RND_0_3(a,b,c,d,e,f,g,h,i) \
    /* L1 = (e >>> 23) ^ e */ \
    "xorq "#e", " L1 "\n\t" \
    /* L2 = (f ^ g) & e */ \
    "andq "#e", " L2 "\n\t" \

#define RND_0_4(a,b,c,d,e,f,g,h,i) \
    /* L1 = ((e >>> 23) ^ e) >>> 4 */ \
    "rorq $4, " L1 "\n\t" \
    /* L2 = ((f ^ g) & e) ^ g */ \
    "xorq "#g", " L2 "\n\t" \

#define RND_0_5(a,b,c,d,e,f,g,h,i) \
    /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */ \
    "xorq "#e", " L1 "\n\t" \
    /* h += Ch(e,f,g) */ \
    "addq " L2 ", "#h"\n\t" \

#define RND_0_6(a,b,c,d,e,f,g,h,i) \
    /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \
    "rorq $14, " L1 "\n\t" \
    /* L3 = a ^ b */ \
    "xorq "#b", " L3 "\n\t" \

#define RND_0_7(a,b,c,d,e,f,g,h,i) \
    /* h += Sigma1(e) */ \
    "addq " L1 ", "#h"\n\t" \
    /* L2 = a */ \
    "movq "#a", " L2 "\n\t" \

#define RND_0_8(a,b,c,d,e,f,g,h,i) \
    /* L4 = (a ^ b) & (b ^ c) */ \
    "andq " L3 ", " L4 "\n\t" \
    /* L2 = a >>> 5 */ \
    "rorq $5, " L2 "\n\t" \

#define RND_0_9(a,b,c,d,e,f,g,h,i) \
    /* L2 = (a >>> 5) ^ a */ \
    "xorq "#a", " L2 "\n\t" \
| 950 | /* L4 = ((a ^ b) & (b ^ c) ^ b */ \
|
---|
| 951 | "xorq "#b", " L4 "\n\t" \
|
---|
[337] | 952 |
|
---|
[372] | 953 | #define RND_0_10(a,b,c,d,e,f,g,h,i) \
|
---|
| 954 | /* L2 = ((a >>> 5) ^ a) >>> 6 */ \
|
---|
| 955 | "rorq $6, " L2 "\n\t" \
|
---|
| 956 | /* d += h */ \
|
---|
| 957 | "addq "#h", "#d"\n\t" \
|
---|
[337] | 958 |
|
---|
[372] | 959 | #define RND_0_11(a,b,c,d,e,f,g,h,i) \
|
---|
| 960 | /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \
|
---|
| 961 | "xorq "#a", " L2 "\n\t" \
|
---|
| 962 | /* h += Sigma0(a) */ \
|
---|
| 963 | "addq " L4 ", "#h"\n\t" \
|
---|
[337] | 964 |
|
---|
[372] | 965 | #define RND_0_12(a,b,c,d,e,f,g,h,i) \
|
---|
| 966 | /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \
|
---|
| 967 | "rorq $28, " L2 "\n\t" \
|
---|
| 968 | /* d (= e next RND) */ \
|
---|
| 969 | "movq "#d", " L1 "\n\t" \
|
---|
| 970 | /* h += Maj(a,b,c) */ \
|
---|
| 971 | "addq " L2 ", "#h"\n\t" \
|
---|
[337] | 972 |
|
---|
[372] | 973 | #define RND_1_1(a,b,c,d,e,f,g,h,i) \
|
---|
| 974 | /* L1 = e >>> 23 */ \
|
---|
| 975 | "rorq $23, " L1 "\n\t" \
|
---|
[337] | 976 |
|
---|
[372] | 977 | #define RND_1_2(a,b,c,d,e,f,g,h,i) \
|
---|
| 978 | /* L4 = a */ \
|
---|
| 979 | "movq "#a", " L4 "\n\t" \
|
---|
| 980 | /* L2 = f */ \
|
---|
| 981 | "movq "#f", " L2 "\n\t" \
|
---|
| 982 | /* h += W_X[i] */ \
|
---|
| 983 | "addq ("#i")*8(" WX "), "#h"\n\t" \
|
---|
| 984 | /* L2 = f ^ g */ \
|
---|
| 985 | "xorq "#g", " L2 "\n\t" \
|
---|
[337] | 986 |
|
---|
[372] | 987 | #define RND_1_2_A(a,b,c,d,e,f,g,h,i) \
|
---|
| 988 | /* L4 = a */ \
|
---|
| 989 | "movq "#a", " L4 "\n\t" \
|
---|
| 990 | /* L2 = f */ \
|
---|
| 991 | "movq "#f", " L2 "\n\t" \
|
---|
[337] | 992 |
|
---|
[372] | 993 | #define RND_1_2_B(a,b,c,d,e,f,g,h,i) \
|
---|
| 994 | /* h += W_X[i] */ \
|
---|
| 995 | "addq ("#i")*8(" WX "), "#h"\n\t" \
|
---|
| 996 | /* L2 = f ^ g */ \
|
---|
| 997 | "xorq "#g", " L2 "\n\t" \
|
---|
[337] | 998 |
|
---|
[372] | 999 | #define RND_1_3(a,b,c,d,e,f,g,h,i) \
|
---|
| 1000 | /* L1 = (e >>> 23) ^ e */ \
|
---|
| 1001 | "xorq "#e", " L1 "\n\t" \
|
---|
| 1002 | /* L2 = (f ^ g) & e */ \
|
---|
| 1003 | "andq "#e", " L2 "\n\t" \
|
---|
[337] | 1004 |
|
---|
[372] | 1005 | #define RND_1_4(a,b,c,d,e,f,g,h,i) \
|
---|
| 1006 | /* ((e >>> 23) ^ e) >>> 4 */ \
|
---|
| 1007 | "rorq $4, " L1 "\n\t" \
|
---|
| 1008 | /* ((f ^ g) & e) ^ g */ \
|
---|
| 1009 | "xorq "#g", " L2 "\n\t" \
|
---|
[337] | 1010 |
|
---|
[372] | 1011 | #define RND_1_5(a,b,c,d,e,f,g,h,i) \
|
---|
| 1012 | /* (((e >>> 23) ^ e) >>> 4) ^ e */ \
|
---|
| 1013 | "xorq "#e", " L1 "\n\t" \
|
---|
| 1014 | /* h += Ch(e,f,g) */ \
|
---|
| 1015 | "addq " L2 ", "#h"\n\t" \
|
---|
[337] | 1016 |
|
---|
[372] | 1017 | #define RND_1_6(a,b,c,d,e,f,g,h,i) \
|
---|
| 1018 | /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \
|
---|
| 1019 | "rorq $14, " L1 "\n\t" \
|
---|
| 1020 | /* L4 = a ^ b */ \
|
---|
| 1021 | "xorq "#b", " L4 "\n\t" \
|
---|
[337] | 1022 |
|
---|
[372] | 1023 | #define RND_1_7(a,b,c,d,e,f,g,h,i) \
|
---|
| 1024 | /* h += Sigma1(e) */ \
|
---|
| 1025 | "addq " L1 ", "#h"\n\t" \
|
---|
| 1026 | /* L2 = a */ \
|
---|
| 1027 | "movq "#a", " L2 "\n\t" \
|
---|
[337] | 1028 |
|
---|
[372] | 1029 | #define RND_1_8(a,b,c,d,e,f,g,h,i) \
|
---|
| 1030 | /* L3 = (a ^ b) & (b ^ c) */ \
|
---|
| 1031 | "andq " L4 ", " L3 "\n\t" \
|
---|
| 1032 | /* L2 = a >>> 5 */ \
|
---|
| 1033 | "rorq $5, " L2 "\n\t" \
|
---|
[337] | 1034 |
|
---|
[372] | 1035 | #define RND_1_9(a,b,c,d,e,f,g,h,i) \
|
---|
| 1036 | /* L2 = (a >>> 5) ^ a */ \
|
---|
| 1037 | "xorq "#a", " L2 "\n\t" \
|
---|
| 1038 | /* L3 = ((a ^ b) & (b ^ c) ^ b */ \
|
---|
| 1039 | "xorq "#b", " L3 "\n\t" \
|
---|
[337] | 1040 |
|
---|
[372] | 1041 | #define RND_1_10(a,b,c,d,e,f,g,h,i) \
|
---|
| 1042 | /* L2 = ((a >>> 5) ^ a) >>> 6 */ \
|
---|
| 1043 | "rorq $6, " L2 "\n\t" \
|
---|
| 1044 | /* d += h */ \
|
---|
| 1045 | "addq "#h", "#d"\n\t" \
|
---|
[337] | 1046 |
|
---|
[372] | 1047 | #define RND_1_11(a,b,c,d,e,f,g,h,i) \
|
---|
| 1048 | /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \
|
---|
| 1049 | "xorq "#a", " L2 "\n\t" \
|
---|
| 1050 | /* h += Sigma0(a) */ \
|
---|
| 1051 | "addq " L3 ", "#h"\n\t" \
|
---|
[337] | 1052 |
|
---|
[372] | 1053 | #define RND_1_12(a,b,c,d,e,f,g,h,i) \
|
---|
| 1054 | /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \
|
---|
| 1055 | "rorq $28, " L2 "\n\t" \
|
---|
| 1056 | /* d (= e next RND) */ \
|
---|
| 1057 | "movq "#d", " L1 "\n\t" \
|
---|
| 1058 | /* h += Maj(a,b,c) */ \
|
---|
| 1059 | "addq " L2 ", "#h"\n\t" \
|
---|
[337] | 1060 |
|
---|
| 1061 |
|
---|
[372] | 1062 | #define MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
|
---|
| 1063 | RND_0_1(a,b,c,d,e,f,g,h,i) \
|
---|
| 1064 | VPALIGNR(W_M15, W_2, W_0, 8) \
|
---|
| 1065 | VPALIGNR(W_M7, W_10, W_8, 8) \
|
---|
| 1066 | RND_0_2(a,b,c,d,e,f,g,h,i) \
|
---|
| 1067 | V_SHIFT_R(XTMP1, W_M15, 1) \
|
---|
| 1068 | V_SHIFT_L(XTMP2, W_M15, 63) \
|
---|
| 1069 | RND_0_3(a,b,c,d,e,f,g,h,i) \
|
---|
| 1070 | RND_0_4(a,b,c,d,e,f,g,h,i) \
|
---|
| 1071 | V_SHIFT_R(XTMP3, W_M15, 8) \
|
---|
| 1072 | V_SHIFT_L(XTMP4, W_M15, 56) \
|
---|
| 1073 | RND_0_5(a,b,c,d,e,f,g,h,i) \
|
---|
| 1074 | RND_0_6(a,b,c,d,e,f,g,h,i) \
|
---|
| 1075 | V_OR(XTMP1, XTMP2, XTMP1) \
|
---|
| 1076 | V_OR(XTMP3, XTMP4, XTMP3) \
|
---|
| 1077 | RND_0_7(a,b,c,d,e,f,g,h,i) \
|
---|
| 1078 | RND_0_8(a,b,c,d,e,f,g,h,i) \
|
---|
| 1079 | V_SHIFT_R(XTMP4, W_M15, 7) \
|
---|
| 1080 | V_XOR(XTMP1, XTMP3, XTMP1) \
|
---|
| 1081 | RND_0_9(a,b,c,d,e,f,g,h,i) \
|
---|
| 1082 | RND_0_10(a,b,c,d,e,f,g,h,i) \
|
---|
| 1083 | V_XOR(XTMP1, XTMP4, XTMP1) \
|
---|
| 1084 | V_ADD(W_0, W_0, W_M7) \
|
---|
| 1085 | RND_0_11(a,b,c,d,e,f,g,h,i) \
|
---|
| 1086 | RND_0_12(a,b,c,d,e,f,g,h,i) \
|
---|
| 1087 | RND_1_1(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1088 | V_ADD(W_0, W_0, XTMP1) \
|
---|
| 1089 | RND_1_2(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1090 | V_SHIFT_R(XTMP1, W_14, 19) \
|
---|
| 1091 | V_SHIFT_L(XTMP2, W_14, 45) \
|
---|
| 1092 | RND_1_3(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1093 | RND_1_4(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1094 | V_SHIFT_R(XTMP3, W_14, 61) \
|
---|
| 1095 | V_SHIFT_L(XTMP4, W_14, 3) \
|
---|
| 1096 | RND_1_5(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1097 | RND_1_6(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1098 | RND_1_7(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1099 | V_OR(XTMP1, XTMP2, XTMP1) \
|
---|
| 1100 | V_OR(XTMP3, XTMP4, XTMP3) \
|
---|
| 1101 | RND_1_8(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1102 | RND_1_9(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1103 | V_XOR(XTMP1, XTMP3, XTMP1) \
|
---|
| 1104 | V_SHIFT_R(XTMP4, W_14, 6) \
|
---|
| 1105 | RND_1_10(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1106 | RND_1_11(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1107 | V_XOR(XTMP1, XTMP4, XTMP1) \
|
---|
| 1108 | RND_1_12(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1109 | V_ADD(W_0, W_0, XTMP1) \
|
---|
[337] | 1110 |
|
---|
[372] | 1111 | #define RND_ALL_2(a, b, c, d, e, f, g, h, i) \
|
---|
| 1112 | RND_0_1 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1113 | RND_0_2 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1114 | RND_0_3 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1115 | RND_0_4 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1116 | RND_0_5 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1117 | RND_0_6 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1118 | RND_0_7 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1119 | RND_0_8 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1120 | RND_0_9 (a, b, c, d, e, f, g, h, i ) \
|
---|
| 1121 | RND_0_10(a, b, c, d, e, f, g, h, i ) \
|
---|
| 1122 | RND_0_11(a, b, c, d, e, f, g, h, i ) \
|
---|
| 1123 | RND_0_12(a, b, c, d, e, f, g, h, i ) \
|
---|
| 1124 | RND_1_1 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1125 | RND_1_2 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1126 | RND_1_3 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1127 | RND_1_4 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1128 | RND_1_5 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1129 | RND_1_6 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1130 | RND_1_7 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1131 | RND_1_8 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1132 | RND_1_9 (h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1133 | RND_1_10(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1134 | RND_1_11(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1135 | RND_1_12(h, a, b, c, d, e, f, g, i+1)
|
---|
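
/* Added note: MsgSched2 and RND_ALL_2 above are "stitched": each macro
 * covers two SHA-512 rounds, and MsgSched2 interleaves the XMM message
 * schedule instructions (computing the next pair of W words) between the
 * scalar round instructions so the vector and integer units can execute
 * in parallel instead of serializing. RND_ALL_2 is the same two rounds
 * without a schedule update, for use where no further W words are needed.
 */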


#if defined(HAVE_INTEL_RORX)

#define RND_RORX_0_1(a, b, c, d, e, f, g, h, i) \
    /* L1 = e>>>14 */ \
    "rorxq $14, "#e", " L1 "\n\t" \
    /* L2 = e>>>18 */ \
    "rorxq $18, "#e", " L2 "\n\t" \
    /* Prev RND: h += Maj(a,b,c) */ \
    "addq " L3 ", "#a"\n\t" \

#define RND_RORX_0_2(a, b, c, d, e, f, g, h, i) \
    /* h += w_k */ \
    "addq ("#i")*8(" WX "), "#h"\n\t" \
    /* L3 = f */ \
    "movq "#f", " L3 "\n\t" \
    /* L2 = (e>>>14) ^ (e>>>18) */ \
    "xorq " L1 ", " L2 "\n\t" \

#define RND_RORX_0_3(a, b, c, d, e, f, g, h, i) \
    /* L3 = f ^ g */ \
    "xorq "#g", " L3 "\n\t" \
    /* L1 = e>>>41 */ \
    "rorxq $41, "#e", " L1 "\n\t" \
    /* L1 = Sigma1(e) */ \
    "xorq " L2 ", " L1 "\n\t" \

#define RND_RORX_0_4(a, b, c, d, e, f, g, h, i) \
    /* L3 = (f ^ g) & e */ \
    "andq "#e", " L3 "\n\t" \
    /* h += Sigma1(e) */ \
    "addq " L1 ", "#h"\n\t" \
    /* L1 = a>>>28 */ \
    "rorxq $28, "#a", " L1 "\n\t" \

#define RND_RORX_0_5(a, b, c, d, e, f, g, h, i) \
    /* L2 = a>>>34 */ \
    "rorxq $34, "#a", " L2 "\n\t" \
    /* L3 = Ch(e,f,g) */ \
    "xorq "#g", " L3 "\n\t" \
    /* L2 = (a>>>28) ^ (a>>>34) */ \
    "xorq " L1 ", " L2 "\n\t" \

#define RND_RORX_0_6(a, b, c, d, e, f, g, h, i) \
    /* L1 = a>>>39 */ \
    "rorxq $39, "#a", " L1 "\n\t" \
    /* h += Ch(e,f,g) */ \
    "addq " L3 ", "#h"\n\t" \
    /* L1 = Sigma0(a) */ \
    "xorq " L2 ", " L1 "\n\t" \

#define RND_RORX_0_7(a, b, c, d, e, f, g, h, i) \
    /* L3 = b */ \
    "movq "#b", " L3 "\n\t" \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
    "addq "#h", "#d"\n\t" \
    /* L3 = a ^ b */ \
    "xorq "#a", " L3 "\n\t" \

#define RND_RORX_0_8(a, b, c, d, e, f, g, h, i) \
    /* L4 = (a ^ b) & (b ^ c) */ \
    "andq " L3 ", " L4 "\n\t" \
    /* h += Sigma0(a) */ \
    "addq " L1 ", "#h"\n\t" \
    /* L4 = Maj(a,b,c) */ \
    "xorq "#b", " L4 "\n\t" \

#define RND_RORX_1_1(a, b, c, d, e, f, g, h, i) \
    /* L1 = e>>>14 */ \
    "rorxq $14, "#e", " L1 "\n\t" \
    /* L2 = e>>>18 */ \
    "rorxq $18, "#e", " L2 "\n\t" \
    /* Prev RND: h += Maj(a,b,c) */ \
    "addq " L4 ", "#a"\n\t" \

#define RND_RORX_1_2(a, b, c, d, e, f, g, h, i) \
    /* h += w_k */ \
    "addq ("#i")*8(" WX "), "#h"\n\t" \
    /* L4 = f */ \
    "movq "#f", " L4 "\n\t" \
    /* L2 = (e>>>14) ^ (e>>>18) */ \
    "xorq " L1 ", " L2 "\n\t" \

#define RND_RORX_1_3(a, b, c, d, e, f, g, h, i) \
    /* L4 = f ^ g */ \
    "xorq "#g", " L4 "\n\t" \
    /* L1 = e>>>41 */ \
    "rorxq $41, "#e", " L1 "\n\t" \
    /* L1 = Sigma1(e) */ \
    "xorq " L2 ", " L1 "\n\t" \

#define RND_RORX_1_4(a, b, c, d, e, f, g, h, i) \
    /* L4 = (f ^ g) & e */ \
    "andq "#e", " L4 "\n\t" \
    /* h += Sigma1(e) */ \
    "addq " L1 ", "#h"\n\t" \
    /* L1 = a>>>28 */ \
    "rorxq $28, "#a", " L1 "\n\t" \

#define RND_RORX_1_5(a, b, c, d, e, f, g, h, i) \
    /* L2 = a>>>34 */ \
    "rorxq $34, "#a", " L2 "\n\t" \
    /* L4 = Ch(e,f,g) */ \
    "xorq "#g", " L4 "\n\t" \
    /* L2 = (a>>>28) ^ (a>>>34) */ \
    "xorq " L1 ", " L2 "\n\t" \

#define RND_RORX_1_6(a, b, c, d, e, f, g, h, i) \
    /* L1 = a>>>39 */ \
    "rorxq $39, "#a", " L1 "\n\t" \
    /* h += Ch(e,f,g) */ \
    "addq " L4 ", "#h"\n\t" \
    /* L1 = Sigma0(a) */ \
    "xorq " L2 ", " L1 "\n\t" \

#define RND_RORX_1_7(a, b, c, d, e, f, g, h, i) \
    /* L4 = b */ \
    "movq "#b", " L4 "\n\t" \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
    "addq "#h", "#d"\n\t" \
    /* L4 = a ^ b */ \
    "xorq "#a", " L4 "\n\t" \

#define RND_RORX_1_8(a, b, c, d, e, f, g, h, i) \
| 1261 | /* L2 = (a ^ b) & (b ^ c) */ \
|
---|
| 1262 | "andq " L4 ", " L3 "\n\t" \
|
---|
| 1263 | /* h += Sigma0(a) */ \
|
---|
| 1264 | "addq " L1 ", "#h"\n\t" \
|
---|
| 1265 | /* L3 = Maj(a,b,c) */ \
|
---|
| 1266 | "xorq "#b", " L3 "\n\t" \
|
---|
| 1267 |
|
---|
| 1268 | #define RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i) \
|
---|
| 1269 | RND_RORX_0_1(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1270 | RND_RORX_0_2(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1271 | RND_RORX_0_3(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1272 | RND_RORX_0_4(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1273 | RND_RORX_0_5(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1274 | RND_RORX_0_6(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1275 | RND_RORX_0_7(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1276 | RND_RORX_0_8(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1277 | RND_RORX_1_1(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1278 | RND_RORX_1_2(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1279 | RND_RORX_1_3(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1280 | RND_RORX_1_4(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1281 | RND_RORX_1_5(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1282 | RND_RORX_1_6(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1283 | RND_RORX_1_7(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1284 | RND_RORX_1_8(h, a, b, c, d, e, f, g, i+1) \
|
---|
| 1285 |
|
---|
| 1286 | #define RND_RORX_ALL_4(a, b, c, d, e, f, g, h, i) \
|
---|
| 1287 | RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i+0) \
|
---|
| 1288 | RND_RORX_ALL_2(g, h, a, b, c, d, e, f, i+2)
|
---|
| 1289 |
|
---|
| 1290 | #define MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
|
---|
| 1291 | RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
|
---|
| 1292 | VPALIGNR(W_M15, W_2, W_0, 8) \
|
---|
| 1293 | VPALIGNR(W_M7, W_10, W_8, 8) \
|
---|
| 1294 | RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
|
---|
| 1295 | V_SHIFT_R(XTMP1, W_M15, 1) \
|
---|
| 1296 | V_SHIFT_L(XTMP2, W_M15, 63) \
|
---|
| 1297 | RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
|
---|
| 1298 | V_SHIFT_R(XTMP3, W_M15, 8) \
|
---|
| 1299 | V_SHIFT_L(XTMP4, W_M15, 56) \
|
---|
| 1300 | RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
|
---|
| 1301 | V_OR(XTMP1, XTMP2, XTMP1) \
|
---|
| 1302 | V_OR(XTMP3, XTMP4, XTMP3) \
|
---|
| 1303 | RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
|
---|
| 1304 | V_SHIFT_R(XTMP4, W_M15, 7) \
|
---|
| 1305 | V_XOR(XTMP1, XTMP3, XTMP1) \
|
---|
| 1306 | RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
|
---|
| 1307 | V_XOR(XTMP1, XTMP4, XTMP1) \
|
---|
| 1308 | V_ADD(W_0, W_0, W_M7) \
|
---|
| 1309 | RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
|
---|
| 1310 | RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
|
---|
| 1311 | V_ADD(W_0, W_0, XTMP1) \
|
---|
| 1312 | RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1313 | V_SHIFT_R(XTMP1, W_14, 19) \
|
---|
| 1314 | V_SHIFT_L(XTMP2, W_14, 45) \
|
---|
| 1315 | RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1316 | V_SHIFT_R(XTMP3, W_14, 61) \
|
---|
| 1317 | V_SHIFT_L(XTMP4, W_14, 3) \
|
---|
| 1318 | RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1319 | V_OR(XTMP1, XTMP2, XTMP1) \
|
---|
| 1320 | V_OR(XTMP3, XTMP4, XTMP3) \
|
---|
| 1321 | RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1322 | RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1323 | V_XOR(XTMP1, XTMP3, XTMP1) \
|
---|
| 1324 | V_SHIFT_R(XTMP4, W_14, 6) \
|
---|
| 1325 | RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1326 | RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1327 | V_XOR(XTMP1, XTMP4, XTMP1) \
|
---|
| 1328 | RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
|
---|
| 1329 | V_ADD(W_0, W_0, XTMP1) \
|
---|
| 1330 |
|
---|
| 1331 | #endif
|
---|
| 1332 |
|
---|
| 1333 | #define _INIT_MASK(mask) \
|
---|
| 1334 | "vmovdqu %[mask], %%" #mask "\n\t"
|
---|
| 1335 | #define INIT_MASK(mask) \
|
---|
| 1336 | _INIT_MASK(mask)
|
---|
| 1337 |
|
---|
| 1338 | #define _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
|
---|
| 1339 | "vmovdqu " #i1 "*16(%%" #reg "), %%" #xmm1 "\n\t" \
|
---|
| 1340 | "vmovdqu " #i2 "*16(%%" #reg "), %%" #xmm2 "\n\t" \
|
---|
| 1341 | "vpshufb %%" #mask ", %%" #xmm1 ", %%" #xmm1 "\n\t" \
|
---|
| 1342 | "vpshufb %%" #mask ", %%" #xmm2 ", %%" #xmm2 "\n\t"
|
---|
| 1343 | #define LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
|
---|
| 1344 | _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg)
|
---|
| 1345 |
|
---|
| 1346 | #define LOAD_W(mask, reg) \
|
---|
| 1347 | /* X0..3(xmm4..7), W[0..15] = buffer[0.15]; */ \
|
---|
| 1348 | LOAD_W_2(0, 1, W_0 , W_2 , mask, reg) \
|
---|
| 1349 | LOAD_W_2(2, 3, W_4 , W_6 , mask, reg) \
|
---|
| 1350 | LOAD_W_2(4, 5, W_8 , W_10, mask, reg) \
|
---|
| 1351 | LOAD_W_2(6, 7, W_12, W_14, mask, reg)
|
---|
| 1352 |
|
---|
| 1353 | #define _SET_W_X_2(xmm0, xmm1, reg, i) \
|
---|
| 1354 | "vpaddq " #i "+ 0(%%" #reg "), %%" #xmm0 ", %%xmm8\n\t" \
|
---|
| 1355 | "vpaddq " #i "+16(%%" #reg "), %%" #xmm1 ", %%xmm9\n\t" \
|
---|
| 1356 | "vmovdqu %%xmm8, " #i "+ 0(" WX ")\n\t" \
|
---|
| 1357 | "vmovdqu %%xmm9, " #i "+16(" WX ")\n\t" \
|
---|
| 1358 |
|
---|
| 1359 | #define SET_W_X_2(xmm0, xmm1, reg, i) \
|
---|
| 1360 | _SET_W_X_2(xmm0, xmm1, reg, i)
|
---|
| 1361 |
|
---|
| 1362 | #define SET_W_X(reg) \
|
---|
| 1363 | SET_W_X_2(W_0 , W_2 , reg, 0) \
|
---|
| 1364 | SET_W_X_2(W_4 , W_6 , reg, 32) \
|
---|
| 1365 | SET_W_X_2(W_8 , W_10, reg, 64) \
|
---|
| 1366 | SET_W_X_2(W_12, W_14, reg, 96)
|
---|
| 1367 |
|
---|
#define LOAD_DIGEST() \
    "movq (%[sha512]), %%r8 \n\t" \
    "movq 8(%[sha512]), %%r9 \n\t" \
    "movq 16(%[sha512]), %%r10\n\t" \
    "movq 24(%[sha512]), %%r11\n\t" \
    "movq 32(%[sha512]), %%r12\n\t" \
    "movq 40(%[sha512]), %%r13\n\t" \
    "movq 48(%[sha512]), %%r14\n\t" \
    "movq 56(%[sha512]), %%r15\n\t"

#define STORE_ADD_DIGEST() \
    "addq %%r8, (%[sha512])\n\t" \
    "addq %%r9, 8(%[sha512])\n\t" \
    "addq %%r10, 16(%[sha512])\n\t" \
    "addq %%r11, 24(%[sha512])\n\t" \
    "addq %%r12, 32(%[sha512])\n\t" \
    "addq %%r13, 40(%[sha512])\n\t" \
    "addq %%r14, 48(%[sha512])\n\t" \
    "addq %%r15, 56(%[sha512])\n\t"

#define ADD_DIGEST() \
    "addq (%[sha512]), %%r8 \n\t" \
    "addq 8(%[sha512]), %%r9 \n\t" \
    "addq 16(%[sha512]), %%r10\n\t" \
    "addq 24(%[sha512]), %%r11\n\t" \
    "addq 32(%[sha512]), %%r12\n\t" \
    "addq 40(%[sha512]), %%r13\n\t" \
    "addq 48(%[sha512]), %%r14\n\t" \
    "addq 56(%[sha512]), %%r15\n\t"

#define STORE_DIGEST() \
    "movq %%r8, (%[sha512])\n\t" \
    "movq %%r9, 8(%[sha512])\n\t" \
    "movq %%r10, 16(%[sha512])\n\t" \
    "movq %%r11, 24(%[sha512])\n\t" \
    "movq %%r12, 32(%[sha512])\n\t" \
    "movq %%r13, 40(%[sha512])\n\t" \
    "movq %%r14, 48(%[sha512])\n\t" \
    "movq %%r15, 56(%[sha512])\n\t"

#endif /* HAVE_INTEL_AVX1 */


/*** Transform Body ***/
#if defined(HAVE_INTEL_AVX1)
static int Transform_Sha512_AVX1(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter. */
        "subq $136, %%rsp\n\t"
        "leaq 64(%[sha512]), %%rax\n\t"

        INIT_MASK(MASK)
        LOAD_DIGEST()

        LOAD_W(MASK, rax)

        "movl $4, 16*8(" WX ")\n\t"
        "leaq %[K512], %%rsi\n\t"
        /* b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"
        /* b ^ c */
        "xorq %%r10, " L4 "\n\t"

        "# Start of 16 rounds\n"
        "1:\n\t"

        SET_W_X(rsi)

        "addq $128, %%rsi\n\t"

        MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
        MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
        MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
        MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
        MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

        "subl $1, 16*8(" WX ")\n\t"
        "jne 1b\n\t"

        SET_W_X(rsi)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        STORE_ADD_DIGEST()

        "addq $136, %%rsp\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}

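/* Multi-block variant: expects len to be a positive multiple of
 * WC_SHA512_BLOCK_SIZE (128 bytes) and reads the input pointer from offset
 * 224 of wc_Sha512, the sha512->data member used by the C wrappers. */
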
static int Transform_Sha512_AVX1_Len(wc_Sha512* sha512, word32 len)
{
    __asm__ __volatile__ (

        "movq 224(%[sha512]), %%rsi\n\t"
        "leaq %[K512], %%rdx\n\t"

        INIT_MASK(MASK)
        LOAD_DIGEST()

        "# Start of processing a block\n"
        "2:\n\t"

        /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
         * Debug needs more stack space. */
        "subq $256, %%rsp\n\t"

        LOAD_W(MASK, rsi)

        "movl $4, 16*8(" WX ")\n\t"
        /* b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"
        /* b ^ c */
        "xorq %%r10, " L4 "\n\t"

        SET_W_X(rdx)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq $128, %%rdx\n\t"
        "movq %%rdx, 17*8(%%rsp)\n\t"

        MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
        MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
        MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
        MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
        MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

        "movq 17*8(%%rsp), %%rdx\n\t"

        SET_W_X(rdx)

        "subl $1, 16*8(" WX ")\n\t"
        "jne 1b\n\t"

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        ADD_DIGEST()

        "addq $256, %%rsp\n\t"
        "leaq %[K512], %%rdx\n\t"
        "addq $128, %%rsi\n\t"
        "subl $128, %[len]\n\t"

        STORE_DIGEST()

        "jnz 2b\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK),
          [len] "m" (len),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}
#endif /* HAVE_INTEL_AVX1 */

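/* The _RORX variants additionally use the BMI2 rorx instruction, a rotate
 * that does not read or write the flags register and therefore schedules
 * more freely between the round additions. */
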
#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
static int Transform_Sha512_AVX1_RORX(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter and K512. */
        "subq $144, %%rsp\n\t"
        "leaq 64(%[sha512]), %%rax\n\t"

        INIT_MASK(MASK)
        LOAD_DIGEST()

        LOAD_W(MASK, rax)

        "movl $4, 16*8(" WX ")\n\t"
        "leaq %[K512], %%rsi\n\t"
        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"
        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"

        SET_W_X(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq $128, %%rsi\n\t"

        MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
        MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
        MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
        MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
        MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

        SET_W_X(rsi)

        "subl $1, 16*8(" WX ")\n\t"
        "jne 1b\n\t"

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        /* Prev RND: h += Maj(a,b,c) */
        "addq " L3 ", %%r8\n\t"
        "addq $144, %%rsp\n\t"

        STORE_ADD_DIGEST()

        :
        : [mask] "m" (mBYTE_FLIP_MASK),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}

static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512* sha512, word32 len)
{
    __asm__ __volatile__ (

        "movq 224(%[sha512]), %%rsi\n\t"
        "leaq %[K512], %%rcx\n\t"

        INIT_MASK(MASK)
        LOAD_DIGEST()

        "# Start of processing a block\n"
        "2:\n\t"

        /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
         * Debug needs more stack space. */
        "subq $256, %%rsp\n\t"

        LOAD_W(MASK, rsi)

        "movl $4, 16*8(" WX ")\n\t"
        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"
        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"

        SET_W_X(rcx)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq $128, %%rcx\n\t"
        "movq %%rcx, 17*8(%%rsp)\n\t"

        MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
        MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
        MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
        MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
        MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

        "movq 17*8(%%rsp), %%rcx\n\t"

        SET_W_X(rcx)

        "subl $1, 16*8(" WX ")\n\t"
        "jne 1b\n\t"

        SET_W_X(rcx)

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        /* Prev RND: h += Maj(a,b,c) */
        "addq " L3 ", %%r8\n\t"
        "addq $256, %%rsp\n\t"

        ADD_DIGEST()

        "leaq %[K512], %%rcx\n\t"
        "addq $128, %%rsi\n\t"
        "subl $128, %[len]\n\t"

        STORE_DIGEST()

        "jnz 2b\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK),
          [len] "m" (len),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}
#endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */

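/* AVX2 implementation: the message schedule lives in 256-bit ymm registers,
 * four 64-bit schedule words per register, so each vector step advances the
 * schedule four rounds at a time. */
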
#if defined(HAVE_INTEL_AVX2)
static const unsigned long mBYTE_FLIP_MASK_Y[] =
    { 0x0001020304050607, 0x08090a0b0c0d0e0f,
      0x0001020304050607, 0x08090a0b0c0d0e0f };

#define W_Y_0 ymm0
#define W_Y_4 ymm1
#define W_Y_8 ymm2
#define W_Y_12 ymm3

#define X0 xmm0
#define X1 xmm1
#define X2 xmm2
#define X3 xmm3
#define X4 xmm4
#define X5 xmm5
#define X6 xmm6
#define X7 xmm7
#define X8 xmm8
#define X9 xmm9
#define Y0 ymm0
#define Y1 ymm1
#define Y2 ymm2
#define Y3 ymm3
#define Y4 ymm4
#define Y5 ymm5
#define Y6 ymm6
#define Y7 ymm7

#define W_Y_M15 ymm12
#define W_Y_M7 ymm13
#define W_Y_M2 ymm14
#define MASK_Y ymm15

#define YTMP1 ymm8
#define YTMP2 ymm9
#define YTMP3 ymm10
#define YTMP4 ymm11

#define YMM_REGS \
    "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", \
    "xmm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"

#define _VPERM2I128(dest, src1, src2, sel) \
    "vperm2I128 $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
#define VPERM2I128(dest, src1, src2, sel) \
    _VPERM2I128(dest, src1, src2, sel)

#define _VPERMQ(dest, src, sel) \
    "vpermq $" #sel ", %%" #src ", %%" #dest "\n\t"
#define VPERMQ(dest, src, sel) \
    _VPERMQ(dest, src, sel)

#define _VPBLENDD(dest, src1, src2, sel) \
    "vpblendd $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
#define VPBLENDD(dest, src1, src2, sel) \
    _VPBLENDD(dest, src1, src2, sel)

#define _V_ADD_I(dest, src1, addr, i) \
    "vpaddq "#i"*8(%%" #addr "), %%" #src1 ", %%" #dest "\n\t"
#define V_ADD_I(dest, src1, addr, i) \
    _V_ADD_I(dest, src1, addr, i)

#define _VMOVDQU_I(addr, i, src) \
    "vmovdqu %%" #src ", " #i "*8(%%" #addr ")\n\t"
#define VMOVDQU_I(addr, i, src) \
    _VMOVDQU_I(addr, i, src)

#define MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
    RND_0_1(a,b,c,d,e,f,g,h,i) \
    /* W[-13]..W[-15], W[-12] */ \
    VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \
    /* W[-5]..W[-7], W[-4] */ \
    VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \
    RND_0_2(a,b,c,d,e,f,g,h,i) \
    RND_0_3(a,b,c,d,e,f,g,h,i) \
    /* W_Y_M15 = W[-12]..W[-15] */ \
    VPERMQ(W_Y_M15, W_Y_M15, 0x39) \
    RND_0_4(a,b,c,d,e,f,g,h,i) \
    /* W_Y_M7 = W[-4]..W[-7] */ \
    VPERMQ(W_Y_M7, W_Y_M7, 0x39) \
    RND_0_5(a,b,c,d,e,f,g,h,i) \
    RND_0_6(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >> 1 */ \
    V_SHIFT_R(YTMP1, W_Y_M15, 1) \
    RND_0_7(a,b,c,d,e,f,g,h,i) \
    /* W[-15] << 63 */ \
    V_SHIFT_L(YTMP2, W_Y_M15, 63) \
    RND_0_8(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >> 8 */ \
    V_SHIFT_R(YTMP3, W_Y_M15, 8) \
    RND_0_9(a,b,c,d,e,f,g,h,i) \
    /* W[-15] << 56 */ \
    V_SHIFT_L(YTMP4, W_Y_M15, 56) \
    RND_0_10(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >>> 1 */ \
    V_OR(YTMP1, YTMP2, YTMP1) \
    RND_0_11(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >>> 8 */ \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_0_12(a,b,c,d,e,f,g,h,i) \
    RND_1_1(h,a,b,c,d,e,f,g,i+1) \
    /* W[-15] >> 7 */ \
    V_SHIFT_R(YTMP4, W_Y_M15, 7) \
    RND_1_2_A(h,a,b,c,d,e,f,g,i+1) \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_1_2_B(h,a,b,c,d,e,f,g,i+1) \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_1_3(h,a,b,c,d,e,f,g,i+1) \
    /* W[0] = W[-16] + W[-7] */ \
    V_ADD(W_Y_0, W_Y_0, W_Y_M7) \
    RND_1_4(h,a,b,c,d,e,f,g,i+1) \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \
    V_ADD(W_Y_0, W_Y_0, YTMP1) \
    RND_1_5(h,a,b,c,d,e,f,g,i+1) \
    /* 0, 0, W[-1], W[-2] */ \
    VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \
    RND_1_6(h,a,b,c,d,e,f,g,i+1) \
    RND_1_7(h,a,b,c,d,e,f,g,i+1) \
    RND_1_8(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >> 19 */ \
    V_SHIFT_R(YTMP1, W_Y_M2, 19) \
    RND_1_9(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] << 45 */ \
    V_SHIFT_L(YTMP2, W_Y_M2, 45) \
    RND_1_10(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >> 61 */ \
    V_SHIFT_R(YTMP3, W_Y_M2, 61) \
    RND_1_11(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] << 3 */ \
    V_SHIFT_L(YTMP4, W_Y_M2, 3) \
    RND_1_12(h,a,b,c,d,e,f,g,i+1) \
    RND_0_1(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] >>> 19 */ \
    V_OR(YTMP1, YTMP2, YTMP1) \
    RND_0_2(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] >>> 61 */ \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_0_3(g,h,a,b,c,d,e,f,i+2) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_0_4(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] >> 6 */ \
    V_SHIFT_R(YTMP4, W_Y_M2, 6) \
    RND_0_5(g,h,a,b,c,d,e,f,i+2) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_0_6(g,h,a,b,c,d,e,f,i+2) \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
    V_ADD(W_Y_0, W_Y_0, YTMP1) \
    RND_0_7(g,h,a,b,c,d,e,f,i+2) \
    RND_0_8(g,h,a,b,c,d,e,f,i+2) \
    /* W[1], W[0], 0, 0 */ \
    VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \
    RND_0_9(g,h,a,b,c,d,e,f,i+2) \
    RND_0_10(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] >> 19 */ \
    V_SHIFT_R(YTMP1, W_Y_M2, 19) \
    RND_0_11(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] << 45 */ \
    V_SHIFT_L(YTMP2, W_Y_M2, 45) \
    RND_0_12(g,h,a,b,c,d,e,f,i+2) \
    RND_1_1(f,g,h,a,b,c,d,e,i+3) \
    /* W[-2] >> 61 */ \
    V_SHIFT_R(YTMP3, W_Y_M2, 61) \
    RND_1_2(f,g,h,a,b,c,d,e,i+3) \
    /* W[-2] << 3 */ \
    V_SHIFT_L(YTMP4, W_Y_M2, 3) \
    RND_1_3(f,g,h,a,b,c,d,e,i+3) \
    /* W[-2] >>> 19 */ \
    V_OR(YTMP1, YTMP2, YTMP1) \
    RND_1_4(f,g,h,a,b,c,d,e,i+3) \
    /* W[-2] >>> 61 */ \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_1_5(f,g,h,a,b,c,d,e,i+3) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_1_6(f,g,h,a,b,c,d,e,i+3) \
    /* W[-2] >> 6 */ \
    V_SHIFT_R(YTMP4, W_Y_M2, 6) \
    RND_1_7(f,g,h,a,b,c,d,e,i+3) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_1_8(f,g,h,a,b,c,d,e,i+3) \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
    V_ADD(W_Y_0, W_Y_0, YTMP1) \
    RND_1_9(f,g,h,a,b,c,d,e,i+3) \
    RND_1_10(f,g,h,a,b,c,d,e,i+3) \
    RND_1_11(f,g,h,a,b,c,d,e,i+3) \
    RND_1_12(f,g,h,a,b,c,d,e,i+3) \

#define MsgSched2_AVX2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
    RND_0_1(a,b,c,d,e,f,g,h,i) \
    VPALIGNR(W_Y_M15, W_2, W_0, 8) \
    VPALIGNR(W_Y_M7, W_10, W_8, 8) \
    RND_0_2(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(YTMP1, W_Y_M15, 1) \
    V_SHIFT_L(YTMP2, W_Y_M15, 63) \
    RND_0_3(a,b,c,d,e,f,g,h,i) \
    RND_0_4(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(YTMP3, W_Y_M15, 8) \
    V_SHIFT_L(YTMP4, W_Y_M15, 56) \
    RND_0_5(a,b,c,d,e,f,g,h,i) \
    RND_0_6(a,b,c,d,e,f,g,h,i) \
    V_OR(YTMP1, YTMP2, YTMP1) \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_0_7(a,b,c,d,e,f,g,h,i) \
    RND_0_8(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(YTMP4, W_Y_M15, 7) \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_0_9(a,b,c,d,e,f,g,h,i) \
    RND_0_10(a,b,c,d,e,f,g,h,i) \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    V_ADD(W_0, W_0, W_Y_M7) \
    RND_0_11(a,b,c,d,e,f,g,h,i) \
    RND_0_12(a,b,c,d,e,f,g,h,i) \
    RND_1_1(h,a,b,c,d,e,f,g,i+1) \
    V_ADD(W_0, W_0, YTMP1) \
    RND_1_2(h,a,b,c,d,e,f,g,i+1) \
    V_SHIFT_R(YTMP1, W_14, 19) \
    V_SHIFT_L(YTMP2, W_14, 45) \
    RND_1_3(h,a,b,c,d,e,f,g,i+1) \
    RND_1_4(h,a,b,c,d,e,f,g,i+1) \
    V_SHIFT_R(YTMP3, W_14, 61) \
    V_SHIFT_L(YTMP4, W_14, 3) \
    RND_1_5(h,a,b,c,d,e,f,g,i+1) \
    RND_1_6(h,a,b,c,d,e,f,g,i+1) \
    RND_1_7(h,a,b,c,d,e,f,g,i+1) \
    V_OR(YTMP1, YTMP2, YTMP1) \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_1_8(h,a,b,c,d,e,f,g,i+1) \
    RND_1_9(h,a,b,c,d,e,f,g,i+1) \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    V_SHIFT_R(YTMP4, W_14, 6) \
    RND_1_10(h,a,b,c,d,e,f,g,i+1) \
    RND_1_11(h,a,b,c,d,e,f,g,i+1) \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_1_12(h,a,b,c,d,e,f,g,i+1) \
    V_ADD(W_0, W_0, YTMP1) \

#define MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
    RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
    /* W[-13]..W[-15], W[-12] */ \
    VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \
    /* W[-5]..W[-7], W[-4] */ \
    VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \
    RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
    /* W_Y_M15 = W[-12]..W[-15] */ \
    VPERMQ(W_Y_M15, W_Y_M15, 0x39) \
    RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
    /* W_Y_M7 = W[-4]..W[-7] */ \
    VPERMQ(W_Y_M7, W_Y_M7, 0x39) \
    RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >> 1 */ \
    V_SHIFT_R(YTMP1, W_Y_M15, 1) \
    /* W[-15] << 63 */ \
    V_SHIFT_L(YTMP2, W_Y_M15, 63) \
    RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >> 8 */ \
    V_SHIFT_R(YTMP3, W_Y_M15, 8) \
    /* W[-15] << 56 */ \
    V_SHIFT_L(YTMP4, W_Y_M15, 56) \
    /* W[-15] >>> 1 */ \
    V_OR(YTMP1, YTMP2, YTMP1) \
    /* W[-15] >>> 8 */ \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >> 7 */ \
    V_SHIFT_R(YTMP4, W_Y_M15, 7) \
    RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
    /* 0, 0, W[-1], W[-2] */ \
    VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \
    RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
    RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
    /* W[0] = W[-16] + W[-7] */ \
    V_ADD(W_Y_0, W_Y_0, W_Y_M7) \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \
    V_ADD(W_Y_0, W_Y_0, YTMP1) \
    RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >> 19 */ \
    V_SHIFT_R(YTMP1, W_Y_M2, 19) \
    /* W[-2] << 45 */ \
    V_SHIFT_L(YTMP2, W_Y_M2, 45) \
    RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >> 61 */ \
    V_SHIFT_R(YTMP3, W_Y_M2, 61) \
    /* W[-2] << 3 */ \
    V_SHIFT_L(YTMP4, W_Y_M2, 3) \
    /* W[-2] >>> 19 */ \
    V_OR(YTMP1, YTMP2, YTMP1) \
    RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >>> 61 */ \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >> 6 */ \
    V_SHIFT_R(YTMP4, W_Y_M2, 6) \
    RND_RORX_0_1(g,h,a,b,c,d,e,f,i+2) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_RORX_0_2(g,h,a,b,c,d,e,f,i+2) \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
    V_ADD(W_Y_0, W_Y_0, YTMP1) \
    RND_RORX_0_3(g,h,a,b,c,d,e,f,i+2) \
    /* W[1], W[0], 0, 0 */ \
    VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \
    RND_RORX_0_4(g,h,a,b,c,d,e,f,i+2) \
    RND_RORX_0_5(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] >> 19 */ \
    V_SHIFT_R(YTMP1, W_Y_M2, 19) \
    /* W[-2] << 45 */ \
    V_SHIFT_L(YTMP2, W_Y_M2, 45) \
    RND_RORX_0_6(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] >> 61 */ \
    V_SHIFT_R(YTMP3, W_Y_M2, 61) \
    /* W[-2] << 3 */ \
    V_SHIFT_L(YTMP4, W_Y_M2, 3) \
    /* W[-2] >>> 19 */ \
    V_OR(YTMP1, YTMP2, YTMP1) \
    RND_RORX_0_7(g,h,a,b,c,d,e,f,i+2) \
    /* W[-2] >>> 61 */ \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_RORX_0_8(g,h,a,b,c,d,e,f,i+2) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_RORX_1_1(f,g,h,a,b,c,d,e,i+3) \
    /* W[-2] >> 6 */ \
    V_SHIFT_R(YTMP4, W_Y_M2, 6) \
    RND_RORX_1_2(f,g,h,a,b,c,d,e,i+3) \
    RND_RORX_1_3(f,g,h,a,b,c,d,e,i+3) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_RORX_1_4(f,g,h,a,b,c,d,e,i+3) \
    RND_RORX_1_5(f,g,h,a,b,c,d,e,i+3) \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
    V_ADD(W_Y_0, W_Y_0, YTMP1) \
    RND_RORX_1_6(f,g,h,a,b,c,d,e,i+3) \
    V_ADD_I(YTMP1, W_Y_0, rsi, i) \
    RND_RORX_1_7(f,g,h,a,b,c,d,e,i+3) \
    RND_RORX_1_8(f,g,h,a,b,c,d,e,i+3) \
    VMOVDQU_I(rsp, i, YTMP1) \

#define MsgSched2_AVX2_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e, \
                            f,g,h,i) \
    RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
    VPALIGNR(W_Y_M15, W_2, W_0, 8) \
    VPALIGNR(W_Y_M7, W_10, W_8, 8) \
    RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(YTMP1, W_Y_M15, 1) \
    V_SHIFT_L(YTMP2, W_Y_M15, 63) \
    RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(YTMP3, W_Y_M15, 8) \
    V_SHIFT_L(YTMP4, W_Y_M15, 56) \
    RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
    V_OR(YTMP1, YTMP2, YTMP1) \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(YTMP4, W_Y_M15, 7) \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    V_ADD(W_0, W_0, W_Y_M7) \
    RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
    RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
    V_ADD(W_0, W_0, YTMP1) \
    RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
    V_SHIFT_R(YTMP1, W_14, 19) \
    V_SHIFT_L(YTMP2, W_14, 45) \
    RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
    V_SHIFT_R(YTMP3, W_14, 61) \
    V_SHIFT_L(YTMP4, W_14, 3) \
    RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
    V_OR(YTMP1, YTMP2, YTMP1) \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
    RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    V_SHIFT_R(YTMP4, W_14, 6) \
    RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
    RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
    V_ADD(W_0, W_0, YTMP1) \


#define _INIT_MASK_Y(mask) \
    "vmovdqu %[mask], %%"#mask"\n\t"
#define INIT_MASK_Y(mask) \
    _INIT_MASK_Y(mask)

/* Load into YMM registers and swap endian. */
#define _LOAD_BLOCK_W_Y_2(mask, ymm0, ymm1, reg, i) \
    /* buffer[0..15] => ymm0..ymm3; */ \
    "vmovdqu " #i "+ 0(%%" #reg "), %%" #ymm0 "\n\t" \
    "vmovdqu " #i "+32(%%" #reg "), %%" #ymm1 "\n\t" \
    "vpshufb %%" #mask ", %%" #ymm0 ", %%" #ymm0 "\n\t" \
    "vpshufb %%" #mask ", %%" #ymm1 ", %%" #ymm1 "\n\t"

#define LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) \
    _LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i)

#define LOAD_BLOCK_W_Y(mask, reg) \
    LOAD_BLOCK_W_Y_2(mask, W_Y_0, W_Y_4 , reg, 0) \
    LOAD_BLOCK_W_Y_2(mask, W_Y_8, W_Y_12, reg, 64)

#define _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
    "vpaddq " #i "+ 0(%%" #reg "), %%" #ymm0 ", %%" #ymm2 "\n\t" \
    "vpaddq " #i "+32(%%" #reg "), %%" #ymm1 ", %%" #ymm3 "\n\t" \
    "vmovdqu %%" #ymm2 ", " #i "+ 0(" WX ")\n\t" \
    "vmovdqu %%" #ymm3 ", " #i "+32(" WX ")\n\t"

#define SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
    _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i)

#define SET_BLOCK_W_Y(reg) \
    SET_W_Y_2(W_Y_0, W_Y_4 , YTMP1, YTMP2, reg, 0) \
    SET_W_Y_2(W_Y_8, W_Y_12, YTMP1, YTMP2, reg, 64)

/* Load into YMM registers and swap endian. */
#define _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
    "vmovdqu " #i "+ 0(%%" #reg "), %%" #X0 "\n\t" \
    "vmovdqu " #i "+ 16(%%" #reg "), %%" #X1 "\n\t" \
    "vmovdqu " #i "+128(%%" #reg "), %%" #X8 "\n\t" \
    "vmovdqu " #i "+144(%%" #reg "), %%" #X9 "\n\t" \
    "vinserti128 $1, %%" #X8 ", %%" #Y0 ", %%" #Y0 "\n\t" \
    "vinserti128 $1, %%" #X9 ", %%" #Y1 ", %%" #Y1 "\n\t" \
    "vpshufb %%" #mask ", %%" #Y0 ", %%" #Y0 "\n\t" \
    "vpshufb %%" #mask ", %%" #Y1 ", %%" #Y1 "\n\t"

#define LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
    _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i)

#define LOAD_BLOCK2_W_Y(mask, reg) \
    LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, 0) \
    LOAD_BLOCK2_W_Y_2(mask, Y2, Y3, X2, X3, X8, X9, reg, 32) \
    LOAD_BLOCK2_W_Y_2(mask, Y4, Y5, X4, X5, X8, X9, reg, 64) \
    LOAD_BLOCK2_W_Y_2(mask, Y6, Y7, X6, X7, X8, X9, reg, 96) \

#define SET_BLOCK2_W_Y(reg) \
    SET_W_Y_2(Y0, Y1, YTMP1, YTMP2, reg, 0) \
    SET_W_Y_2(Y2, Y3, YTMP1, YTMP2, reg, 64) \
    SET_W_Y_2(Y4, Y5, YTMP1, YTMP2, reg, 128) \
    SET_W_Y_2(Y6, Y7, YTMP1, YTMP2, reg, 192)

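/* For the two-blocks-at-a-time paths, LOAD_BLOCK2_W_Y above interleaves the
 * first block into the low 128 bits and the second block into the high 128
 * bits of each ymm register.  Each K512 constant is therefore duplicated in
 * the table below so that one 256-bit vpaddq adds the same round constant to
 * the schedule words of both blocks. */
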
static const word64 K512_AVX2[160] = {
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};
static const word64* K512_AVX2_END = &K512_AVX2[128];

static int Transform_Sha512_AVX2(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter and K512. */
        "subq $136, %%rsp\n\t"
        "leaq 64(%[sha512]), %%rax\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        LOAD_BLOCK_W_Y(MASK_Y, rax)

        "movl $4, 16*8(" WX ")\n\t"
        "leaq %[K512], %%rsi\n\t"
        /* b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"
        /* b ^ c */
        "xorq %%r10, " L4 "\n\t"

        SET_BLOCK_W_Y(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq $128, %%rsi\n\t"

        MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched4_AVX2(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched4_AVX2(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched4_AVX2(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)

        SET_BLOCK_W_Y(rsi)

        "subl $1, 16*8(" WX ")\n\t"
        "jne 1b\n\t"

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        STORE_ADD_DIGEST()

        "addq $136, %%rsp\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}

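/* Multi-block AVX2 variant: the main loop consumes two 128-byte blocks per
 * pass using the interleaved two-block schedule above. */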
static int Transform_Sha512_AVX2_Len(wc_Sha512* sha512, word32 len)
{
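    /* With an odd number of blocks, run one block through the single-block
     * transform first so that the two-block loop below sees an even count. */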
    if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
        XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
        Transform_Sha512_AVX2(sha512);
        sha512->data += WC_SHA512_BLOCK_SIZE;
        len -= WC_SHA512_BLOCK_SIZE;
        if (len == 0)
            return 0;
    }

    __asm__ __volatile__ (

        "movq 224(%[sha512]), %%rcx\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        "# Start of processing two blocks\n"
        "2:\n\t"

        "subq $1344, %%rsp\n\t"
        "leaq %[K512], %%rsi\n\t"

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"

        LOAD_BLOCK2_W_Y(MASK_Y, rcx)

        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"
        "\n"
        "1:\n\t"
        SET_BLOCK2_W_Y(rsi)
        MsgSched2_AVX2(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched2_AVX2(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
        MsgSched2_AVX2(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
        MsgSched2_AVX2(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
        MsgSched2_AVX2(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
        MsgSched2_AVX2(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
        MsgSched2_AVX2(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
        MsgSched2_AVX2(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
        "addq $256, %%rsi\n\t"
        "addq $256, %%rsp\n\t"
        "cmpq %[K512_END], %%rsi\n\t"
        "jne 1b\n\t"

        SET_BLOCK2_W_Y(rsi)
        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
        "subq $1024, %%rsp\n\t"

        ADD_DIGEST()
        STORE_DIGEST()

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"
        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"

        "movq $5, %%rsi\n\t"
        "\n"
        "3:\n\t"
        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
        "addq $256, %%rsp\n\t"
        "subq $1, %%rsi\n\t"
        "jnz 3b\n\t"

        ADD_DIGEST()

        "movq 224(%[sha512]), %%rcx\n\t"
        "addq $64, %%rsp\n\t"
        "addq $256, %%rcx\n\t"
        "subl $256, %[len]\n\t"
        "movq %%rcx, 224(%[sha512])\n\t"

        STORE_DIGEST()

        "jnz 2b\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [len] "m" (len),
          [sha512] "r" (sha512),
          [K512] "m" (K512_AVX2),
          [K512_END] "m" (K512_AVX2_END)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}

#ifdef HAVE_INTEL_RORX
static int Transform_Sha512_AVX2_RORX(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter. */
        "subq $136, %%rsp\n\t"
        "leaq 64(%[sha512]), " L2 "\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        LOAD_BLOCK_W_Y(MASK_Y, rcx)

        "movl $4, 16*8(" WX ")\n\t"
        "leaq %[K512], %%rsi\n\t"
        /* b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"
        /* b ^ c */
        "xorq %%r10, " L4 "\n\t"

        SET_BLOCK_W_Y(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq $128, %%rsi\n\t"

        MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched4_AVX2_RORX_SET(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched4_AVX2_RORX_SET(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched4_AVX2_RORX_SET(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)

        "subl $1, 16*8(%%rsp)\n\t"
        "jnz 1b\n\t"

        RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD,12)
        /* Prev RND: h += Maj(a,b,c) */
        "addq " L3 ", %%r8\n\t"
        "addq $136, %%rsp\n\t"

        STORE_ADD_DIGEST()

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}

static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512* sha512, word32 len)
{
    if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
        XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
        Transform_Sha512_AVX2_RORX(sha512);
        sha512->data += WC_SHA512_BLOCK_SIZE;
        len -= WC_SHA512_BLOCK_SIZE;
        if (len == 0)
            return 0;
    }

    __asm__ __volatile__ (

        "movq 224(%[sha512]), %%rax\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        "# Start of processing two blocks\n"
        "2:\n\t"

        "subq $1344, %%rsp\n\t"
        "leaq %[K512], %%rsi\n\t"

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"

        LOAD_BLOCK2_W_Y(MASK_Y, rax)

        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"
        "\n"
        "1:\n\t"
        SET_BLOCK2_W_Y(rsi)
        MsgSched2_AVX2_RORX(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched2_AVX2_RORX(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
        MsgSched2_AVX2_RORX(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
        MsgSched2_AVX2_RORX(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
        MsgSched2_AVX2_RORX(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
        MsgSched2_AVX2_RORX(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
        MsgSched2_AVX2_RORX(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
        MsgSched2_AVX2_RORX(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
        "addq $256, %%rsi\n\t"
        "addq $256, %%rsp\n\t"
        "cmpq %[K512_END], %%rsi\n\t"
        "jne 1b\n\t"

        SET_BLOCK2_W_Y(rsi)
        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
        "addq " L3 ", %%r8\n\t"
        "subq $1024, %%rsp\n\t"

        ADD_DIGEST()
        STORE_DIGEST()

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"
        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"

        "movq $5, %%rsi\n\t"
        "\n"
        "3:\n\t"
        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
        "addq $256, %%rsp\n\t"
        "subq $1, %%rsi\n\t"
        "jnz 3b\n\t"

        "addq " L3 ", %%r8\n\t"

        ADD_DIGEST()

        "movq 224(%[sha512]), %%rax\n\t"
        "addq $64, %%rsp\n\t"
        "addq $256, %%rax\n\t"
        "subl $256, %[len]\n\t"
        "movq %%rax, 224(%[sha512])\n\t"

        STORE_DIGEST()

        "jnz 2b\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [len] "m" (len),
          [sha512] "r" (sha512),
          [K512] "m" (K512_AVX2),
          [K512_END] "m" (K512_AVX2_END)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}
#endif /* HAVE_INTEL_RORX */
#endif /* HAVE_INTEL_AVX2 */

#endif /* WOLFSSL_SHA512 */


/* -------------------------------------------------------------------------- */
/* SHA384 */
/* -------------------------------------------------------------------------- */
#ifdef WOLFSSL_SHA384

#if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
    /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
#else

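/* SHA-384 shares the SHA-512 block size and compression function; it differs
 * only in its initial hash values and in truncating the output to 384 bits
 * (48 bytes). */
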
[337] | 2621 | static int InitSha384(wc_Sha384* sha384)
|
---|
| 2622 | {
|
---|
| 2623 | if (sha384 == NULL) {
|
---|
| 2624 | return BAD_FUNC_ARG;
|
---|
| 2625 | }
|
---|
| 2626 |
|
---|
| 2627 | sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
|
---|
| 2628 | sha384->digest[1] = W64LIT(0x629a292a367cd507);
|
---|
| 2629 | sha384->digest[2] = W64LIT(0x9159015a3070dd17);
|
---|
| 2630 | sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
|
---|
| 2631 | sha384->digest[4] = W64LIT(0x67332667ffc00b31);
|
---|
| 2632 | sha384->digest[5] = W64LIT(0x8eb44a8768581511);
|
---|
| 2633 | sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
|
---|
| 2634 | sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);
|
---|
| 2635 |
|
---|
| 2636 | sha384->buffLen = 0;
|
---|
| 2637 | sha384->loLen = 0;
|
---|
| 2638 | sha384->hiLen = 0;
|
---|
| 2639 |
|
---|
| 2640 | return 0;
|
---|
| 2641 | }
|
---|

int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len)
{
    if (sha384 == NULL || (data == NULL && len > 0)) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha384(&sha384->asyncDev, NULL, data, len);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    return Sha512Update((wc_Sha512*)sha384, data, len);
}


int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash)
{
#ifdef LITTLE_ENDIAN_ORDER
    word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)];
#endif

    if (sha384 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#ifdef LITTLE_ENDIAN_ORDER
    ByteReverseWords64((word64*)digest, (word64*)sha384->digest,
                       WC_SHA384_DIGEST_SIZE);
    XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE);
#else
    XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
#endif

    return 0;
}

int wc_Sha384Final(wc_Sha384* sha384, byte* hash)
{
    int ret;

    if (sha384 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha384(&sha384->asyncDev, hash, NULL,
                                WC_SHA384_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha512Final((wc_Sha512*)sha384);
    if (ret != 0)
        return ret;

    XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);

    return InitSha384(sha384);  /* reset state */
}
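
/* Illustrative usage sketch (not part of the wolfSSL sources): one-shot
 * SHA-384 hashing with the public wrappers defined in this file.  The guard
 * macro and helper name are hypothetical, so this is never built by
 * default. */
#ifdef WOLFSSL_SHA384_USAGE_EXAMPLE
static int Sha384HashExample(const byte* data, word32 len,
                             byte hash[WC_SHA384_DIGEST_SIZE])
{
    wc_Sha384 sha384;
    int ret = wc_InitSha384(&sha384);

    if (ret == 0) {
        ret = wc_Sha384Update(&sha384, data, len);
        if (ret == 0)
            ret = wc_Sha384Final(&sha384, hash);
        wc_Sha384Free(&sha384);
    }
    return ret;
}
#endif /* WOLFSSL_SHA384_USAGE_EXAMPLE */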

int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
{
    int ret;

    if (sha384 == NULL) {
        return BAD_FUNC_ARG;
    }

    sha384->heap = heap;
    ret = InitSha384(sha384);
    if (ret != 0)
        return ret;

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    Sha512_SetTransform();
#endif
#ifdef WOLFSSL_SMALL_STACK_CACHE
    sha384->W = NULL;
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    ret = wolfAsync_DevCtxInit(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384,
                               sha384->heap, devId);
#else
    (void)devId;
#endif /* WOLFSSL_ASYNC_CRYPT */

    return ret;
}

#endif /* WOLFSSL_IMX6_CAAM */

int wc_InitSha384(wc_Sha384* sha384)
{
    return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID);
}

void wc_Sha384Free(wc_Sha384* sha384)
{
    if (sha384 == NULL)
        return;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    if (sha384->W != NULL) {
        XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        sha384->W = NULL;
    }
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    wolfAsync_DevCtxFree(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384);
#endif /* WOLFSSL_ASYNC_CRYPT */
}

#endif /* WOLFSSL_SHA384 */

#endif /* HAVE_FIPS */

#ifdef WOLFSSL_SHA512

int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash)
{
    int ret;
    wc_Sha512 tmpSha512;

    if (sha512 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha512Copy(sha512, &tmpSha512);
    if (ret == 0) {
        ret = wc_Sha512Final(&tmpSha512, hash);
        wc_Sha512Free(&tmpSha512);
    }
    return ret;
}
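
/* Illustrative usage sketch (not part of the wolfSSL sources): because
 * wc_Sha512GetHash() finalizes a temporary copy, a caller can read the
 * digest of the data seen so far and keep feeding the same context.  The
 * guard macro and helper name are hypothetical. */
#ifdef WOLFSSL_SHA512_GETHASH_EXAMPLE
static int Sha512IntermediateExample(wc_Sha512* sha512, const byte* more,
                                     word32 moreLen,
                                     byte midHash[WC_SHA512_DIGEST_SIZE])
{
    /* digest of everything hashed so far; sha512 itself is left untouched */
    int ret = wc_Sha512GetHash(sha512, midHash);

    if (ret == 0) {
        /* the running stream continues as if no digest had been taken */
        ret = wc_Sha512Update(sha512, more, moreLen);
    }
    return ret;
}
#endif /* WOLFSSL_SHA512_GETHASH_EXAMPLE */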

int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(wc_Sha512));
#ifdef WOLFSSL_SMALL_STACK_CACHE
    dst->W = NULL;
#endif

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}
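
/* Illustrative usage sketch (not part of the wolfSSL sources): forking a
 * running SHA-512 stream with wc_Sha512Copy() so that two different suffixes
 * can be hashed over the same already-absorbed prefix.  The guard macro and
 * helper name are hypothetical. */
#ifdef WOLFSSL_SHA512_COPY_EXAMPLE
static int Sha512TwoSuffixExample(const byte* prefix, word32 prefixLen,
                                  const byte* a, word32 aLen,
                                  const byte* b, word32 bLen,
                                  byte hashA[WC_SHA512_DIGEST_SIZE],
                                  byte hashB[WC_SHA512_DIGEST_SIZE])
{
    wc_Sha512 base;
    wc_Sha512 forked;
    int ret = wc_InitSha512(&base);

    if (ret != 0)
        return ret;

    ret = wc_Sha512Update(&base, prefix, prefixLen);
    if (ret == 0)
        ret = wc_Sha512Copy(&base, &forked);  /* snapshot of the prefix */
    if (ret == 0) {
        ret = wc_Sha512Update(&base, a, aLen);
        if (ret == 0)
            ret = wc_Sha512Final(&base, hashA);
        if (ret == 0)
            ret = wc_Sha512Update(&forked, b, bLen);
        if (ret == 0)
            ret = wc_Sha512Final(&forked, hashB);
        wc_Sha512Free(&forked);
    }
    wc_Sha512Free(&base);
    return ret;
}
#endif /* WOLFSSL_SHA512_COPY_EXAMPLE */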

#endif /* WOLFSSL_SHA512 */

#ifdef WOLFSSL_SHA384

int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash)
{
    int ret;
    wc_Sha384 tmpSha384;

    if (sha384 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha384Copy(sha384, &tmpSha384);
    if (ret == 0) {
        ret = wc_Sha384Final(&tmpSha384, hash);
        wc_Sha384Free(&tmpSha384);
    }
    return ret;
}
int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(wc_Sha384));
#ifdef WOLFSSL_SMALL_STACK_CACHE
    dst->W = NULL;
#endif

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}

#endif /* WOLFSSL_SHA384 */

#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */