// Copyright (c) 2017-present The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
//
// This is a translation to GCC extended asm syntax from YASM code by Intel
// (available at the bottom of this file).

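// Usage sketch (hypothetical caller, for orientation only): Transform
// consumes whole 64-byte blocks, so the caller seeds the state with the
// standard SHA-256 IV and handles message padding itself.
//
//   uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//                        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
//   sha256_sse4::Transform(state, data, nblocks); // data: nblocks * 64 bytes
//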
#if defined(__x86_64__) || defined(__amd64__)

#include <cstdint>
#include <cstdlib>

namespace sha256_sse4
{
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
#if defined(__clang__)
  /*
  clang is unable to compile this with -O0 and -fsanitize=address.
  See upstream bug: https://github.com/llvm/llvm-project/issues/92182.
  It also fails to compile with -O2, -fcf-protection and -fsanitize=address.
  See https://github.com/bitcoin/bitcoin/issues/31913.
  */
#if __has_feature(address_sanitizer)
  __attribute__((no_sanitize("address")))
#endif
#endif
{
    static const uint32_t K256 alignas(16) [] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
    uint64_t tbl;
    uint64_t inp_end, inp;
    uint32_t xfer alignas(16) [4];

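    // Operand map for the asm below: %0 = s, %1 = chunk (also reused as the
    // round-group counter inside each block), %2 = blocks (scaled into an end
    // pointer; its 32-bit half %k2 carries the working variable e), %3..%9 =
    // a,b,c,d,f,g,h, %10..%12 = y0,y1,y2 scratch, %13 = tbl, %14..%16 =
    // inp_end/inp/xfer spill slots, %17..%20 = the four tables above.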
    __asm__ __volatile__(
        "shl    $0x6,%2;"
        "je     Ldone_hash_%=;"
        "add    %1,%2;"
        "mov    %2,%14;"
        "mov    (%0),%3;"
        "mov    0x4(%0),%4;"
        "mov    0x8(%0),%5;"
        "mov    0xc(%0),%6;"
        "mov    0x10(%0),%k2;"
        "mov    0x14(%0),%7;"
        "mov    0x18(%0),%8;"
        "mov    0x1c(%0),%9;"
        "movdqa %18,%%xmm12;"
        "movdqa %19,%%xmm10;"
        "movdqa %20,%%xmm11;"

        "Lloop0_%=:"
        "lea    %17,%13;"
        "movdqu (%1),%%xmm4;"
        "pshufb %%xmm12,%%xmm4;"
        "movdqu 0x10(%1),%%xmm5;"
        "pshufb %%xmm12,%%xmm5;"
        "movdqu 0x20(%1),%%xmm6;"
        "pshufb %%xmm12,%%xmm6;"
        "movdqu 0x30(%1),%%xmm7;"
        "pshufb %%xmm12,%%xmm7;"
        "mov    %1,%15;"
        "mov    $3,%1;"

        "Lloop1_%=:"
        "movdqa 0x0(%13),%%xmm9;"
        "paddd  %%xmm4,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "movdqa %%xmm7,%%xmm0;"
        "mov    %k2,%10;"
        "ror    $0xe,%10;"
        "mov    %3,%11;"
        "palignr $0x4,%%xmm6,%%xmm0;"
        "ror    $0x9,%11;"
        "xor    %k2,%10;"
        "mov    %7,%12;"
        "ror    $0x5,%10;"
        "movdqa %%xmm5,%%xmm1;"
        "xor    %3,%11;"
        "xor    %8,%12;"
        "paddd  %%xmm4,%%xmm0;"
        "xor    %k2,%10;"
        "and    %k2,%12;"
        "ror    $0xb,%11;"
        "palignr $0x4,%%xmm4,%%xmm1;"
        "xor    %3,%11;"
        "ror    $0x6,%10;"
        "xor    %8,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov    %3,%10;"
        "add    %12,%9;"
        "mov    %3,%12;"
        "pslld  $0x19,%%xmm1;"
        "or     %5,%10;"
        "add    %9,%6;"
        "and    %5,%12;"
        "psrld  $0x7,%%xmm2;"
        "and    %4,%10;"
        "add    %11,%9;"
        "por    %%xmm2,%%xmm1;"
        "or     %12,%10;"
        "add    %10,%9;"
        "movdqa %%xmm3,%%xmm2;"
        "mov    %6,%10;"
        "mov    %9,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror    $0xe,%10;"
        "xor    %6,%10;"
        "mov    %k2,%12;"
        "ror    $0x9,%11;"
        "pslld  $0xe,%%xmm3;"
        "xor    %9,%11;"
        "ror    $0x5,%10;"
        "xor    %7,%12;"
        "psrld  $0x12,%%xmm2;"
        "ror    $0xb,%11;"
        "xor    %6,%10;"
        "and    %6,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm1;"
        "xor    %9,%11;"
        "xor    %7,%12;"
        "psrld  $0x3,%%xmm8;"
        "add    %10,%12;"
        "add    4+%16,%12;"
        "ror    $0x2,%11;"
        "pxor   %%xmm2,%%xmm1;"
        "mov    %9,%10;"
        "add    %12,%8;"
        "mov    %9,%12;"
        "pxor   %%xmm8,%%xmm1;"
        "or     %4,%10;"
        "add    %8,%5;"
        "and    %4,%12;"
        "pshufd $0xfa,%%xmm7,%%xmm2;"
        "and    %3,%10;"
        "add    %11,%8;"
        "paddd  %%xmm1,%%xmm0;"
        "or     %12,%10;"
        "add    %10,%8;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %5,%10;"
        "mov    %8,%11;"
        "ror    $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor    %5,%10;"
        "ror    $0x9,%11;"
        "mov    %6,%12;"
        "xor    %8,%11;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %k2,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %5,%10;"
        "and    %5,%12;"
        "psrld  $0xa,%%xmm8;"
        "ror    $0xb,%11;"
        "xor    %8,%11;"
        "xor    %k2,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm2;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    8+%16,%12;"
        "pxor   %%xmm2,%%xmm8;"
        "mov    %8,%10;"
        "add    %12,%7;"
        "mov    %8,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or     %3,%10;"
        "add    %7,%4;"
        "and    %3,%12;"
        "paddd  %%xmm8,%%xmm0;"
        "and    %9,%10;"
        "add    %11,%7;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or     %12,%10;"
        "add    %10,%7;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %4,%10;"
        "ror    $0xe,%10;"
        "mov    %7,%11;"
        "movdqa %%xmm2,%%xmm4;"
        "ror    $0x9,%11;"
        "xor    %4,%10;"
        "mov    %5,%12;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %7,%11;"
        "xor    %6,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %4,%10;"
        "and    %4,%12;"
        "ror    $0xb,%11;"
        "psrld  $0xa,%%xmm4;"
        "xor    %7,%11;"
        "ror    $0x6,%10;"
        "xor    %6,%12;"
        "pxor   %%xmm3,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    12+%16,%12;"
        "pxor   %%xmm2,%%xmm4;"
        "mov    %7,%10;"
        "add    %12,%k2;"
        "mov    %7,%12;"
        "pshufb %%xmm11,%%xmm4;"
        "or     %9,%10;"
        "add    %k2,%3;"
        "and    %9,%12;"
        "paddd  %%xmm0,%%xmm4;"
        "and    %8,%10;"
        "add    %11,%k2;"
        "or     %12,%10;"
        "add    %10,%k2;"
        "movdqa 0x10(%13),%%xmm9;"
        "paddd  %%xmm5,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "movdqa %%xmm4,%%xmm0;"
        "mov    %3,%10;"
        "ror    $0xe,%10;"
        "mov    %k2,%11;"
        "palignr $0x4,%%xmm7,%%xmm0;"
        "ror    $0x9,%11;"
        "xor    %3,%10;"
        "mov    %4,%12;"
        "ror    $0x5,%10;"
        "movdqa %%xmm6,%%xmm1;"
        "xor    %k2,%11;"
        "xor    %5,%12;"
        "paddd  %%xmm5,%%xmm0;"
        "xor    %3,%10;"
        "and    %3,%12;"
        "ror    $0xb,%11;"
        "palignr $0x4,%%xmm5,%%xmm1;"
        "xor    %k2,%11;"
        "ror    $0x6,%10;"
        "xor    %5,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov    %k2,%10;"
        "add    %12,%6;"
        "mov    %k2,%12;"
        "pslld  $0x19,%%xmm1;"
        "or     %8,%10;"
        "add    %6,%9;"
        "and    %8,%12;"
        "psrld  $0x7,%%xmm2;"
        "and    %7,%10;"
        "add    %11,%6;"
        "por    %%xmm2,%%xmm1;"
        "or     %12,%10;"
        "add    %10,%6;"
        "movdqa %%xmm3,%%xmm2;"
        "mov    %9,%10;"
        "mov    %6,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror    $0xe,%10;"
        "xor    %9,%10;"
        "mov    %3,%12;"
        "ror    $0x9,%11;"
        "pslld  $0xe,%%xmm3;"
        "xor    %6,%11;"
        "ror    $0x5,%10;"
        "xor    %4,%12;"
        "psrld  $0x12,%%xmm2;"
        "ror    $0xb,%11;"
        "xor    %9,%10;"
        "and    %9,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm1;"
        "xor    %6,%11;"
        "xor    %4,%12;"
        "psrld  $0x3,%%xmm8;"
        "add    %10,%12;"
        "add    4+%16,%12;"
        "ror    $0x2,%11;"
        "pxor   %%xmm2,%%xmm1;"
        "mov    %6,%10;"
        "add    %12,%5;"
        "mov    %6,%12;"
        "pxor   %%xmm8,%%xmm1;"
        "or     %7,%10;"
        "add    %5,%8;"
        "and    %7,%12;"
        "pshufd $0xfa,%%xmm4,%%xmm2;"
        "and    %k2,%10;"
        "add    %11,%5;"
        "paddd  %%xmm1,%%xmm0;"
        "or     %12,%10;"
        "add    %10,%5;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %8,%10;"
        "mov    %5,%11;"
        "ror    $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor    %8,%10;"
        "ror    $0x9,%11;"
        "mov    %9,%12;"
        "xor    %5,%11;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %3,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %8,%10;"
        "and    %8,%12;"
        "psrld  $0xa,%%xmm8;"
        "ror    $0xb,%11;"
        "xor    %5,%11;"
        "xor    %3,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm2;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    8+%16,%12;"
        "pxor   %%xmm2,%%xmm8;"
        "mov    %5,%10;"
        "add    %12,%4;"
        "mov    %5,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or     %k2,%10;"
        "add    %4,%7;"
        "and    %k2,%12;"
        "paddd  %%xmm8,%%xmm0;"
        "and    %6,%10;"
        "add    %11,%4;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or     %12,%10;"
        "add    %10,%4;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %7,%10;"
        "ror    $0xe,%10;"
        "mov    %4,%11;"
        "movdqa %%xmm2,%%xmm5;"
        "ror    $0x9,%11;"
        "xor    %7,%10;"
        "mov    %8,%12;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %4,%11;"
        "xor    %9,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %7,%10;"
        "and    %7,%12;"
        "ror    $0xb,%11;"
        "psrld  $0xa,%%xmm5;"
        "xor    %4,%11;"
        "ror    $0x6,%10;"
        "xor    %9,%12;"
        "pxor   %%xmm3,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    12+%16,%12;"
        "pxor   %%xmm2,%%xmm5;"
        "mov    %4,%10;"
        "add    %12,%3;"
        "mov    %4,%12;"
        "pshufb %%xmm11,%%xmm5;"
        "or     %6,%10;"
        "add    %3,%k2;"
        "and    %6,%12;"
        "paddd  %%xmm0,%%xmm5;"
        "and    %5,%10;"
        "add    %11,%3;"
        "or     %12,%10;"
        "add    %10,%3;"
        "movdqa 0x20(%13),%%xmm9;"
        "paddd  %%xmm6,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "movdqa %%xmm5,%%xmm0;"
        "mov    %k2,%10;"
        "ror    $0xe,%10;"
        "mov    %3,%11;"
        "palignr $0x4,%%xmm4,%%xmm0;"
        "ror    $0x9,%11;"
        "xor    %k2,%10;"
        "mov    %7,%12;"
        "ror    $0x5,%10;"
        "movdqa %%xmm7,%%xmm1;"
        "xor    %3,%11;"
        "xor    %8,%12;"
        "paddd  %%xmm6,%%xmm0;"
        "xor    %k2,%10;"
        "and    %k2,%12;"
        "ror    $0xb,%11;"
        "palignr $0x4,%%xmm6,%%xmm1;"
        "xor    %3,%11;"
        "ror    $0x6,%10;"
        "xor    %8,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov    %3,%10;"
        "add    %12,%9;"
        "mov    %3,%12;"
        "pslld  $0x19,%%xmm1;"
        "or     %5,%10;"
        "add    %9,%6;"
        "and    %5,%12;"
        "psrld  $0x7,%%xmm2;"
        "and    %4,%10;"
        "add    %11,%9;"
        "por    %%xmm2,%%xmm1;"
        "or     %12,%10;"
        "add    %10,%9;"
        "movdqa %%xmm3,%%xmm2;"
        "mov    %6,%10;"
        "mov    %9,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror    $0xe,%10;"
        "xor    %6,%10;"
        "mov    %k2,%12;"
        "ror    $0x9,%11;"
        "pslld  $0xe,%%xmm3;"
        "xor    %9,%11;"
        "ror    $0x5,%10;"
        "xor    %7,%12;"
        "psrld  $0x12,%%xmm2;"
        "ror    $0xb,%11;"
        "xor    %6,%10;"
        "and    %6,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm1;"
        "xor    %9,%11;"
        "xor    %7,%12;"
        "psrld  $0x3,%%xmm8;"
        "add    %10,%12;"
        "add    4+%16,%12;"
        "ror    $0x2,%11;"
        "pxor   %%xmm2,%%xmm1;"
        "mov    %9,%10;"
        "add    %12,%8;"
        "mov    %9,%12;"
        "pxor   %%xmm8,%%xmm1;"
        "or     %4,%10;"
        "add    %8,%5;"
        "and    %4,%12;"
        "pshufd $0xfa,%%xmm5,%%xmm2;"
        "and    %3,%10;"
        "add    %11,%8;"
        "paddd  %%xmm1,%%xmm0;"
        "or     %12,%10;"
        "add    %10,%8;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %5,%10;"
        "mov    %8,%11;"
        "ror    $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor    %5,%10;"
        "ror    $0x9,%11;"
        "mov    %6,%12;"
        "xor    %8,%11;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %k2,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %5,%10;"
        "and    %5,%12;"
        "psrld  $0xa,%%xmm8;"
        "ror    $0xb,%11;"
        "xor    %8,%11;"
        "xor    %k2,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm2;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    8+%16,%12;"
        "pxor   %%xmm2,%%xmm8;"
        "mov    %8,%10;"
        "add    %12,%7;"
        "mov    %8,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or     %3,%10;"
        "add    %7,%4;"
        "and    %3,%12;"
        "paddd  %%xmm8,%%xmm0;"
        "and    %9,%10;"
        "add    %11,%7;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or     %12,%10;"
        "add    %10,%7;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %4,%10;"
        "ror    $0xe,%10;"
        "mov    %7,%11;"
        "movdqa %%xmm2,%%xmm6;"
        "ror    $0x9,%11;"
        "xor    %4,%10;"
        "mov    %5,%12;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %7,%11;"
        "xor    %6,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %4,%10;"
        "and    %4,%12;"
        "ror    $0xb,%11;"
        "psrld  $0xa,%%xmm6;"
        "xor    %7,%11;"
        "ror    $0x6,%10;"
        "xor    %6,%12;"
        "pxor   %%xmm3,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    12+%16,%12;"
        "pxor   %%xmm2,%%xmm6;"
        "mov    %7,%10;"
        "add    %12,%k2;"
        "mov    %7,%12;"
        "pshufb %%xmm11,%%xmm6;"
        "or     %9,%10;"
        "add    %k2,%3;"
        "and    %9,%12;"
        "paddd  %%xmm0,%%xmm6;"
        "and    %8,%10;"
        "add    %11,%k2;"
        "or     %12,%10;"
        "add    %10,%k2;"
        "movdqa 0x30(%13),%%xmm9;"
        "paddd  %%xmm7,%%xmm9;"
        "movdqa %%xmm9,%16;"
        "add    $0x40,%13;"
        "movdqa %%xmm6,%%xmm0;"
        "mov    %3,%10;"
        "ror    $0xe,%10;"
        "mov    %k2,%11;"
        "palignr $0x4,%%xmm5,%%xmm0;"
        "ror    $0x9,%11;"
        "xor    %3,%10;"
        "mov    %4,%12;"
        "ror    $0x5,%10;"
        "movdqa %%xmm4,%%xmm1;"
        "xor    %k2,%11;"
        "xor    %5,%12;"
        "paddd  %%xmm7,%%xmm0;"
        "xor    %3,%10;"
        "and    %3,%12;"
        "ror    $0xb,%11;"
        "palignr $0x4,%%xmm7,%%xmm1;"
        "xor    %k2,%11;"
        "ror    $0x6,%10;"
        "xor    %5,%12;"
        "movdqa %%xmm1,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    %16,%12;"
        "movdqa %%xmm1,%%xmm3;"
        "mov    %k2,%10;"
        "add    %12,%6;"
        "mov    %k2,%12;"
        "pslld  $0x19,%%xmm1;"
        "or     %8,%10;"
        "add    %6,%9;"
        "and    %8,%12;"
        "psrld  $0x7,%%xmm2;"
        "and    %7,%10;"
        "add    %11,%6;"
        "por    %%xmm2,%%xmm1;"
        "or     %12,%10;"
        "add    %10,%6;"
        "movdqa %%xmm3,%%xmm2;"
        "mov    %9,%10;"
        "mov    %6,%11;"
        "movdqa %%xmm3,%%xmm8;"
        "ror    $0xe,%10;"
        "xor    %9,%10;"
        "mov    %3,%12;"
        "ror    $0x9,%11;"
        "pslld  $0xe,%%xmm3;"
        "xor    %6,%11;"
        "ror    $0x5,%10;"
        "xor    %4,%12;"
        "psrld  $0x12,%%xmm2;"
        "ror    $0xb,%11;"
        "xor    %9,%10;"
        "and    %9,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm1;"
        "xor    %6,%11;"
        "xor    %4,%12;"
        "psrld  $0x3,%%xmm8;"
        "add    %10,%12;"
        "add    4+%16,%12;"
        "ror    $0x2,%11;"
        "pxor   %%xmm2,%%xmm1;"
        "mov    %6,%10;"
        "add    %12,%5;"
        "mov    %6,%12;"
        "pxor   %%xmm8,%%xmm1;"
        "or     %7,%10;"
        "add    %5,%8;"
        "and    %7,%12;"
        "pshufd $0xfa,%%xmm6,%%xmm2;"
        "and    %k2,%10;"
        "add    %11,%5;"
        "paddd  %%xmm1,%%xmm0;"
        "or     %12,%10;"
        "add    %10,%5;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %8,%10;"
        "mov    %5,%11;"
        "ror    $0xe,%10;"
        "movdqa %%xmm2,%%xmm8;"
        "xor    %8,%10;"
        "ror    $0x9,%11;"
        "mov    %9,%12;"
        "xor    %5,%11;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %3,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %8,%10;"
        "and    %8,%12;"
        "psrld  $0xa,%%xmm8;"
        "ror    $0xb,%11;"
        "xor    %5,%11;"
        "xor    %3,%12;"
        "ror    $0x6,%10;"
        "pxor   %%xmm3,%%xmm2;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    8+%16,%12;"
        "pxor   %%xmm2,%%xmm8;"
        "mov    %5,%10;"
        "add    %12,%4;"
        "mov    %5,%12;"
        "pshufb %%xmm10,%%xmm8;"
        "or     %k2,%10;"
        "add    %4,%7;"
        "and    %k2,%12;"
        "paddd  %%xmm8,%%xmm0;"
        "and    %6,%10;"
        "add    %11,%4;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "or     %12,%10;"
        "add    %10,%4;"
        "movdqa %%xmm2,%%xmm3;"
        "mov    %7,%10;"
        "ror    $0xe,%10;"
        "mov    %4,%11;"
        "movdqa %%xmm2,%%xmm7;"
        "ror    $0x9,%11;"
        "xor    %7,%10;"
        "mov    %8,%12;"
        "ror    $0x5,%10;"
        "psrlq  $0x11,%%xmm2;"
        "xor    %4,%11;"
        "xor    %9,%12;"
        "psrlq  $0x13,%%xmm3;"
        "xor    %7,%10;"
        "and    %7,%12;"
        "ror    $0xb,%11;"
        "psrld  $0xa,%%xmm7;"
        "xor    %4,%11;"
        "ror    $0x6,%10;"
        "xor    %9,%12;"
        "pxor   %%xmm3,%%xmm2;"
        "ror    $0x2,%11;"
        "add    %10,%12;"
        "add    12+%16,%12;"
        "pxor   %%xmm2,%%xmm7;"
        "mov    %4,%10;"
        "add    %12,%3;"
        "mov    %4,%12;"
        "pshufb %%xmm11,%%xmm7;"
        "or     %6,%10;"
        "add    %3,%k2;"
        "and    %6,%12;"
        "paddd  %%xmm0,%%xmm7;"
        "and    %5,%10;"
        "add    %11,%3;"
        "or     %12,%10;"
        "add    %10,%3;"
        "sub    $0x1,%1;"
        "jne    Lloop1_%=;"
        "mov    $0x2,%1;"

        "Lloop2_%=:"
        "paddd  0x0(%13),%%xmm4;"
        "movdqa %%xmm4,%16;"
        "mov    %k2,%10;"
        "ror    $0xe,%10;"
        "mov    %3,%11;"
        "xor    %k2,%10;"
        "ror    $0x9,%11;"
        "mov    %7,%12;"
        "xor    %3,%11;"
        "ror    $0x5,%10;"
        "xor    %8,%12;"
        "xor    %k2,%10;"
        "ror    $0xb,%11;"
        "and    %k2,%12;"
        "xor    %3,%11;"
        "ror    $0x6,%10;"
        "xor    %8,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    %16,%12;"
        "mov    %3,%10;"
        "add    %12,%9;"
        "mov    %3,%12;"
        "or     %5,%10;"
        "add    %9,%6;"
        "and    %5,%12;"
        "and    %4,%10;"
        "add    %11,%9;"
        "or     %12,%10;"
        "add    %10,%9;"
        "mov    %6,%10;"
        "ror    $0xe,%10;"
        "mov    %9,%11;"
        "xor    %6,%10;"
        "ror    $0x9,%11;"
        "mov    %k2,%12;"
        "xor    %9,%11;"
        "ror    $0x5,%10;"
        "xor    %7,%12;"
        "xor    %6,%10;"
        "ror    $0xb,%11;"
        "and    %6,%12;"
        "xor    %9,%11;"
        "ror    $0x6,%10;"
        "xor    %7,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    4+%16,%12;"
        "mov    %9,%10;"
        "add    %12,%8;"
        "mov    %9,%12;"
        "or     %4,%10;"
        "add    %8,%5;"
        "and    %4,%12;"
        "and    %3,%10;"
        "add    %11,%8;"
        "or     %12,%10;"
        "add    %10,%8;"
        "mov    %5,%10;"
        "ror    $0xe,%10;"
        "mov    %8,%11;"
        "xor    %5,%10;"
        "ror    $0x9,%11;"
        "mov    %6,%12;"
        "xor    %8,%11;"
        "ror    $0x5,%10;"
        "xor    %k2,%12;"
        "xor    %5,%10;"
        "ror    $0xb,%11;"
        "and    %5,%12;"
        "xor    %8,%11;"
        "ror    $0x6,%10;"
        "xor    %k2,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    8+%16,%12;"
        "mov    %8,%10;"
        "add    %12,%7;"
        "mov    %8,%12;"
        "or     %3,%10;"
        "add    %7,%4;"
        "and    %3,%12;"
        "and    %9,%10;"
        "add    %11,%7;"
        "or     %12,%10;"
        "add    %10,%7;"
        "mov    %4,%10;"
        "ror    $0xe,%10;"
        "mov    %7,%11;"
        "xor    %4,%10;"
        "ror    $0x9,%11;"
        "mov    %5,%12;"
        "xor    %7,%11;"
        "ror    $0x5,%10;"
        "xor    %6,%12;"
        "xor    %4,%10;"
        "ror    $0xb,%11;"
        "and    %4,%12;"
        "xor    %7,%11;"
        "ror    $0x6,%10;"
        "xor    %6,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    12+%16,%12;"
        "mov    %7,%10;"
        "add    %12,%k2;"
        "mov    %7,%12;"
        "or     %9,%10;"
        "add    %k2,%3;"
        "and    %9,%12;"
        "and    %8,%10;"
        "add    %11,%k2;"
        "or     %12,%10;"
        "add    %10,%k2;"
        "paddd  0x10(%13),%%xmm5;"
        "movdqa %%xmm5,%16;"
        "add    $0x20,%13;"
        "mov    %3,%10;"
        "ror    $0xe,%10;"
        "mov    %k2,%11;"
        "xor    %3,%10;"
        "ror    $0x9,%11;"
        "mov    %4,%12;"
        "xor    %k2,%11;"
        "ror    $0x5,%10;"
        "xor    %5,%12;"
        "xor    %3,%10;"
        "ror    $0xb,%11;"
        "and    %3,%12;"
        "xor    %k2,%11;"
        "ror    $0x6,%10;"
        "xor    %5,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    %16,%12;"
        "mov    %k2,%10;"
        "add    %12,%6;"
        "mov    %k2,%12;"
        "or     %8,%10;"
        "add    %6,%9;"
        "and    %8,%12;"
        "and    %7,%10;"
        "add    %11,%6;"
        "or     %12,%10;"
        "add    %10,%6;"
        "mov    %9,%10;"
        "ror    $0xe,%10;"
        "mov    %6,%11;"
        "xor    %9,%10;"
        "ror    $0x9,%11;"
        "mov    %3,%12;"
        "xor    %6,%11;"
        "ror    $0x5,%10;"
        "xor    %4,%12;"
        "xor    %9,%10;"
        "ror    $0xb,%11;"
        "and    %9,%12;"
        "xor    %6,%11;"
        "ror    $0x6,%10;"
        "xor    %4,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    4+%16,%12;"
        "mov    %6,%10;"
        "add    %12,%5;"
        "mov    %6,%12;"
        "or     %7,%10;"
        "add    %5,%8;"
        "and    %7,%12;"
        "and    %k2,%10;"
        "add    %11,%5;"
        "or     %12,%10;"
        "add    %10,%5;"
        "mov    %8,%10;"
        "ror    $0xe,%10;"
        "mov    %5,%11;"
        "xor    %8,%10;"
        "ror    $0x9,%11;"
        "mov    %9,%12;"
        "xor    %5,%11;"
        "ror    $0x5,%10;"
        "xor    %3,%12;"
        "xor    %8,%10;"
        "ror    $0xb,%11;"
        "and    %8,%12;"
        "xor    %5,%11;"
        "ror    $0x6,%10;"
        "xor    %3,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    8+%16,%12;"
        "mov    %5,%10;"
        "add    %12,%4;"
        "mov    %5,%12;"
        "or     %k2,%10;"
        "add    %4,%7;"
        "and    %k2,%12;"
        "and    %6,%10;"
        "add    %11,%4;"
        "or     %12,%10;"
        "add    %10,%4;"
        "mov    %7,%10;"
        "ror    $0xe,%10;"
        "mov    %4,%11;"
        "xor    %7,%10;"
        "ror    $0x9,%11;"
        "mov    %8,%12;"
        "xor    %4,%11;"
        "ror    $0x5,%10;"
        "xor    %9,%12;"
        "xor    %7,%10;"
        "ror    $0xb,%11;"
        "and    %7,%12;"
        "xor    %4,%11;"
        "ror    $0x6,%10;"
        "xor    %9,%12;"
        "add    %10,%12;"
        "ror    $0x2,%11;"
        "add    12+%16,%12;"
        "mov    %4,%10;"
        "add    %12,%3;"
        "mov    %4,%12;"
        "or     %6,%10;"
        "add    %3,%k2;"
        "and    %6,%12;"
        "and    %5,%10;"
        "add    %11,%3;"
        "or     %12,%10;"
        "add    %10,%3;"
        "movdqa %%xmm6,%%xmm4;"
        "movdqa %%xmm7,%%xmm5;"
        "sub    $0x1,%1;"
        "jne    Lloop2_%=;"
        "add    (%0),%3;"
        "mov    %3,(%0);"
        "add    0x4(%0),%4;"
        "mov    %4,0x4(%0);"
        "add    0x8(%0),%5;"
        "mov    %5,0x8(%0);"
        "add    0xc(%0),%6;"
        "mov    %6,0xc(%0);"
        "add    0x10(%0),%k2;"
        "mov    %k2,0x10(%0);"
        "add    0x14(%0),%7;"
        "mov    %7,0x14(%0);"
        "add    0x18(%0),%8;"
        "mov    %8,0x18(%0);"
        "add    0x1c(%0),%9;"
        "mov    %9,0x1c(%0);"
        "mov    %15,%1;"
        "add    $0x40,%1;"
        "cmp    %14,%1;"
        "jne    Lloop0_%=;"

        "Ldone_hash_%=:"

        : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e is carried in %k2 (blocks) */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
        : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
        : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
   );
}
}

/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to https://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

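; For reference, the message schedule computed by FOUR_ROUNDS_AND_SCHED below
; is the FIPS 180-4 recurrence:
;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
;   sigma0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3)
;   sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)
; sigma0 can be applied to four words at once, but sigma1 feeds on W[t-2],
; which is only two words old; hence "s0 four at a time and s1 two at a time".
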
%define MOVDQ movdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
    add %2, %1
    mov %1, %2
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
    MOVDQ %1, %2
    pshufb %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER  xmm9

%define SHUF_00BA   xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00   xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm12

%ifdef LINUX
%define NUM_BLKS rdx    ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND    rdi ; clobbers INP
%define c   ecx
%define d   r8d
%define e   edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx     ; 2nd arg
%define INP rcx     ; 1st arg

%define SRND    rcx ; clobbers INP
%define c   edi
%define d   esi
%define e   r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d



_INP_END_SIZE   equ 8
_INP_SIZE   equ 8
_XFER_SIZE  equ 8
%ifdef LINUX
_XMM_SAVE_SIZE  equ 0
%else
_XMM_SAVE_SIZE  equ 7*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8 (worked example below)
_ALIGN_SIZE equ 8

_INP_END    equ 0
_INP        equ _INP_END  + _INP_END_SIZE
_XFER       equ _INP      + _INP_SIZE
_XMM_SAVE   equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE  equ _XMM_SAVE + _XMM_SAVE_SIZE
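
; Worked example of the alignment rule above: at entry rsp = 8 mod 16 (the
; call pushed an 8-byte return address onto a 16-byte-aligned stack), so an
; odd multiple of 8 for pushes plus STACK_SIZE returns rsp to 16-byte
; alignment, which the aligned movdqa accesses to [rsp + _XFER] and
; _XMM_SAVE require. Linux: 5 pushes (40) + STACK_SIZE (32) = 72 = 9*8;
; elsewhere: 7 pushes (56) + STACK_SIZE (144) = 200 = 25*8.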
1111  
1112  ; rotate_Xs
1113  ; Rotate values of symbols X0...X3
1114  %macro rotate_Xs 0
1115  %xdefine X_ X0
1116  %xdefine X0 X1
1117  %xdefine X1 X2
1118  %xdefine X2 X3
1119  %xdefine X3 X_
1120  %endm
1121  
1122  ; ROTATE_ARGS
1123  ; Rotate values of symbols a...h
1124  %macro ROTATE_ARGS 0
1125  %xdefine TMP_ h
1126  %xdefine h g
1127  %xdefine g f
1128  %xdefine f e
1129  %xdefine e d
1130  %xdefine d c
1131  %xdefine c b
1132  %xdefine b a
1133  %xdefine a TMP_
1134  %endm
1135  
%macro FOUR_ROUNDS_AND_SCHED 0
    ;; compute s0 four at a time and s1 two at a time
    ;; compute W[-16] + W[-7] 4 at a time
    movdqa  XTMP0, X3
    mov y0, e       ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
    palignr XTMP0, X2, 4    ; XTMP0 = W[-7]
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    movdqa  XTMP1, X1
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    xor y2, g       ; y2 = f^g
    paddd   XTMP0, X0   ; XTMP0 = W[-7] + W[-16]
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    ;; compute s0
    palignr XTMP1, X0, 4    ; XTMP1 = W[-15]
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    movdqa  XTMP2, XTMP1    ; XTMP2 = W[-15]
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
    movdqa  XTMP3, XTMP1    ; XTMP3 = W[-15]
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
    pslld   XTMP1, (32-7)
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
    psrld   XTMP2, 7
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    por XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa  XTMP2, XTMP3    ; XTMP2 = W[-15]
    mov y0, e       ; y0 = e
    mov y1, a       ; y1 = a
    movdqa  XTMP4, XTMP3    ; XTMP4 = W[-15]
    ror y0, (25-11) ; y0 = e >> (25-11)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    ror y1, (22-13) ; y1 = a >> (22-13)
    pslld   XTMP3, (32-18)
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g       ; y2 = f^g
    psrld   XTMP2, 18
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    pxor    XTMP1, XTMP3
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    psrld   XTMP4, 3    ; XTMP4 = W[-15] >> 3
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    pxor    XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
    pxor    XTMP1, XTMP4    ; XTMP1 = s0
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
    ;; compute low s1
    pshufd  XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    paddd   XTMP0, XTMP1    ; XTMP0 = W[-16] + W[-7] + s0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {BBAA}
    mov y0, e       ; y0 = e
    mov y1, a       ; y1 = a
    ror y0, (25-11) ; y0 = e >> (25-11)
    movdqa  XTMP4, XTMP2    ; XTMP4 = W[-2] {BBAA}
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f       ; y2 = f
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xBxA}
    xor y2, g       ; y2 = f^g
    psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xBxA}
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    psrld   XTMP4, 10   ; XTMP4 = W[-2] >> 10 {BBAA}
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    pxor    XTMP2, XTMP3
    add y2, y0      ; y2 = S1 + CH
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
    pxor    XTMP4, XTMP2    ; XTMP4 = s1 {xBxA}
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
    pshufb  XTMP4, SHUF_00BA    ; XTMP4 = s1 {00BA}
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
    paddd   XTMP0, XTMP4    ; XTMP0 = {..., ..., W[1], W[0]}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    ;; compute high s1
    pshufd  XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {DDCC}
    mov y0, e       ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
    movdqa  X0,    XTMP2    ; X0    = W[-2] {DDCC}
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xDxC}
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    xor y2, g       ; y2 = f^g
    psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xDxC}
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    psrld   X0,    10   ; X0 = W[-2] >> 10 {DDCC}
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    pxor    XTMP2, XTMP3
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
    pxor    X0, XTMP2   ; X0 = s1 {xDxC}
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
    pshufb  X0, SHUF_DC00   ; X0 = s1 {DC00}
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
    paddd   X0, XTMP0   ; X0 = {W[3], W[2], W[1], W[0]}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
    mov y0, e       ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f       ; y2 = f
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g       ; y2 = f^g
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    and y2, e       ; y2 = (f^g)&e
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    add y2, y0      ; y2 = S1 + CH
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
    ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_sse4
align 32
sha256_sse4:
    push    rbx
%ifndef LINUX
    push    rsi
    push    rdi
%endif
    push    rbp
    push    r13
    push    r14
    push    r15

    sub rsp,STACK_SIZE
%ifndef LINUX
    movdqa  [rsp + _XMM_SAVE + 0*16],xmm6
    movdqa  [rsp + _XMM_SAVE + 1*16],xmm7
    movdqa  [rsp + _XMM_SAVE + 2*16],xmm8
    movdqa  [rsp + _XMM_SAVE + 3*16],xmm9
    movdqa  [rsp + _XMM_SAVE + 4*16],xmm10
    movdqa  [rsp + _XMM_SAVE + 5*16],xmm11
    movdqa  [rsp + _XMM_SAVE + 6*16],xmm12
%endif

    shl NUM_BLKS, 6 ; convert to bytes
    jz  done_hash
    add NUM_BLKS, INP   ; pointer to end of data
    mov [rsp + _INP_END], NUM_BLKS

    ;; load initial digest
    mov a,[4*0 + CTX]
    mov b,[4*1 + CTX]
    mov c,[4*2 + CTX]
    mov d,[4*3 + CTX]
    mov e,[4*4 + CTX]
    mov f,[4*5 + CTX]
    mov g,[4*6 + CTX]
    mov h,[4*7 + CTX]

    movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    movdqa  SHUF_00BA, [_SHUF_00BA wrt rip]
    movdqa  SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
    lea TBL,[K256 wrt rip]

    ;; byte swap first 16 dwords
    COPY_XMM_AND_BSWAP  X0, [INP + 0*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X1, [INP + 1*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X2, [INP + 2*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X3, [INP + 3*16], BYTE_FLIP_MASK

    mov [rsp + _INP], INP

    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
    mov SRND, 3
align 16
loop1:
    movdqa  XFER, [TBL + 0*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa  XFER, [TBL + 1*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa  XFER, [TBL + 2*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa  XFER, [TBL + 3*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    add TBL, 4*16
    FOUR_ROUNDS_AND_SCHED

    sub SRND, 1
    jne loop1

    mov SRND, 2
loop2:
    paddd   X0, [TBL + 0*16]
    movdqa  [rsp + _XFER], X0
    DO_ROUND    0
    DO_ROUND    1
    DO_ROUND    2
    DO_ROUND    3
    paddd   X1, [TBL + 1*16]
    movdqa  [rsp + _XFER], X1
    add TBL, 2*16
    DO_ROUND    0
    DO_ROUND    1
    DO_ROUND    2
    DO_ROUND    3

    movdqa  X0, X2
    movdqa  X1, X3

    sub SRND, 1
    jne loop2

    addm    [4*0 + CTX],a
    addm    [4*1 + CTX],b
    addm    [4*2 + CTX],c
    addm    [4*3 + CTX],d
    addm    [4*4 + CTX],e
    addm    [4*5 + CTX],f
    addm    [4*6 + CTX],g
    addm    [4*7 + CTX],h

    mov INP, [rsp + _INP]
    add INP, 64
    cmp INP, [rsp + _INP_END]
    jne loop0

done_hash:
%ifndef LINUX
    movdqa  xmm6,[rsp + _XMM_SAVE + 0*16]
    movdqa  xmm7,[rsp + _XMM_SAVE + 1*16]
    movdqa  xmm8,[rsp + _XMM_SAVE + 2*16]
    movdqa  xmm9,[rsp + _XMM_SAVE + 3*16]
    movdqa  xmm10,[rsp + _XMM_SAVE + 4*16]
    movdqa  xmm11,[rsp + _XMM_SAVE + 5*16]
    movdqa  xmm12,[rsp + _XMM_SAVE + 6*16]
%endif

    add rsp, STACK_SIZE

    pop r15
    pop r14
    pop r13
    pop rbp
%ifndef LINUX
    pop rdi
    pop rsi
%endif
    pop rbx

    ret


section .data
align 64
K256:
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
*/

#endif