   1  // Copyright (c) 2017-2022 The Bitcoin Core developers
   2  // Distributed under the MIT software license, see the accompanying
   3  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
   4  //
   5  // This is a translation to GCC extended asm syntax from YASM code by Intel
   6  // (available at the bottom of this file).
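//
// A minimal usage sketch (illustrative only; the standalone caller below is
// hypothetical, in the real library this Transform is dispatched to from the
// generic SHA-256 code): `s` points to the eight 32-bit words of SHA-256
// state and `chunk` to `blocks` consecutive 64-byte message blocks.
//
//   uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//                        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
//   unsigned char block[64] = {}; // would hold one padded 64-byte block
//   sha256_sse4::Transform(state, block, 1);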
   7  
   8  #include <cstdlib>
   9  #include <stdint.h>
  10  
  11  #if defined(__x86_64__) || defined(__amd64__)
  12  
  13  namespace sha256_sse4
  14  {
  15  void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
  16  {
  17      static const uint32_t K256 alignas(16) [] = {
  18          0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  19          0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  20          0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  21          0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  22          0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  23          0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  24          0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  25          0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  26          0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  27          0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  28          0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  29          0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  30          0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  31          0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  32          0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  33          0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  34      };
  35      static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
  36      static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
  37      static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
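    // K256 holds the 64 SHA-256 round constants. FLIP_MASK is the pshufb
    // control used to byte-swap the big-endian message words on load;
    // SHUF_00BA and SHUF_DC00 repack the two halves of the s1 message-schedule
    // computation (xBxA -> 00BA and xDxC -> DC00, per the YASM reference below).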
  38      uint32_t a, b, c, d, f, g, h, y0, y1, y2;
  39      uint64_t tbl;
  40      uint64_t inp_end, inp;
  41      uint32_t xfer alignas(16) [4];
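    // Operand map for the asm below: %0 = s, %1 = chunk (reused as the
    // round-group counter inside each block), %2 = blocks (its 32-bit half
    // %k2 later holds the working variable e), %3..%9 = a,b,c,d,f,g,h,
    // %10..%12 = y0,y1,y2, %13 = tbl, %14 = inp_end, %15 = inp, %16 = xfer,
    // %17..%20 = K256, FLIP_MASK, SHUF_00BA, SHUF_DC00.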
  42  
  43      __asm__ __volatile__(
  44          "shl    $0x6,%2;"
  45          "je     Ldone_hash_%=;"
  46          "add    %1,%2;"
  47          "mov    %2,%14;"
  48          "mov    (%0),%3;"
  49          "mov    0x4(%0),%4;"
  50          "mov    0x8(%0),%5;"
  51          "mov    0xc(%0),%6;"
  52          "mov    0x10(%0),%k2;"
  53          "mov    0x14(%0),%7;"
  54          "mov    0x18(%0),%8;"
  55          "mov    0x1c(%0),%9;"
  56          "movdqa %18,%%xmm12;"
  57          "movdqa %19,%%xmm10;"
  58          "movdqa %20,%%xmm11;"
  59  
  60          "Lloop0_%=:"
  61          "lea    %17,%13;"
  62          "movdqu (%1),%%xmm4;"
  63          "pshufb %%xmm12,%%xmm4;"
  64          "movdqu 0x10(%1),%%xmm5;"
  65          "pshufb %%xmm12,%%xmm5;"
  66          "movdqu 0x20(%1),%%xmm6;"
  67          "pshufb %%xmm12,%%xmm6;"
  68          "movdqu 0x30(%1),%%xmm7;"
  69          "pshufb %%xmm12,%%xmm7;"
  70          "mov    %1,%15;"
  71          "mov    $3,%1;"
  72  
  73          "Lloop1_%=:"
  74          "movdqa 0x0(%13),%%xmm9;"
  75          "paddd  %%xmm4,%%xmm9;"
  76          "movdqa %%xmm9,%16;"
  77          "movdqa %%xmm7,%%xmm0;"
  78          "mov    %k2,%10;"
  79          "ror    $0xe,%10;"
  80          "mov    %3,%11;"
  81          "palignr $0x4,%%xmm6,%%xmm0;"
  82          "ror    $0x9,%11;"
  83          "xor    %k2,%10;"
  84          "mov    %7,%12;"
  85          "ror    $0x5,%10;"
  86          "movdqa %%xmm5,%%xmm1;"
  87          "xor    %3,%11;"
  88          "xor    %8,%12;"
  89          "paddd  %%xmm4,%%xmm0;"
  90          "xor    %k2,%10;"
  91          "and    %k2,%12;"
  92          "ror    $0xb,%11;"
  93          "palignr $0x4,%%xmm4,%%xmm1;"
  94          "xor    %3,%11;"
  95          "ror    $0x6,%10;"
  96          "xor    %8,%12;"
  97          "movdqa %%xmm1,%%xmm2;"
  98          "ror    $0x2,%11;"
  99          "add    %10,%12;"
 100          "add    %16,%12;"
 101          "movdqa %%xmm1,%%xmm3;"
 102          "mov    %3,%10;"
 103          "add    %12,%9;"
 104          "mov    %3,%12;"
 105          "pslld  $0x19,%%xmm1;"
 106          "or     %5,%10;"
 107          "add    %9,%6;"
 108          "and    %5,%12;"
 109          "psrld  $0x7,%%xmm2;"
 110          "and    %4,%10;"
 111          "add    %11,%9;"
 112          "por    %%xmm2,%%xmm1;"
 113          "or     %12,%10;"
 114          "add    %10,%9;"
 115          "movdqa %%xmm3,%%xmm2;"
 116          "mov    %6,%10;"
 117          "mov    %9,%11;"
 118          "movdqa %%xmm3,%%xmm8;"
 119          "ror    $0xe,%10;"
 120          "xor    %6,%10;"
 121          "mov    %k2,%12;"
 122          "ror    $0x9,%11;"
 123          "pslld  $0xe,%%xmm3;"
 124          "xor    %9,%11;"
 125          "ror    $0x5,%10;"
 126          "xor    %7,%12;"
 127          "psrld  $0x12,%%xmm2;"
 128          "ror    $0xb,%11;"
 129          "xor    %6,%10;"
 130          "and    %6,%12;"
 131          "ror    $0x6,%10;"
 132          "pxor   %%xmm3,%%xmm1;"
 133          "xor    %9,%11;"
 134          "xor    %7,%12;"
 135          "psrld  $0x3,%%xmm8;"
 136          "add    %10,%12;"
 137          "add    4+%16,%12;"
 138          "ror    $0x2,%11;"
 139          "pxor   %%xmm2,%%xmm1;"
 140          "mov    %9,%10;"
 141          "add    %12,%8;"
 142          "mov    %9,%12;"
 143          "pxor   %%xmm8,%%xmm1;"
 144          "or     %4,%10;"
 145          "add    %8,%5;"
 146          "and    %4,%12;"
 147          "pshufd $0xfa,%%xmm7,%%xmm2;"
 148          "and    %3,%10;"
 149          "add    %11,%8;"
 150          "paddd  %%xmm1,%%xmm0;"
 151          "or     %12,%10;"
 152          "add    %10,%8;"
 153          "movdqa %%xmm2,%%xmm3;"
 154          "mov    %5,%10;"
 155          "mov    %8,%11;"
 156          "ror    $0xe,%10;"
 157          "movdqa %%xmm2,%%xmm8;"
 158          "xor    %5,%10;"
 159          "ror    $0x9,%11;"
 160          "mov    %6,%12;"
 161          "xor    %8,%11;"
 162          "ror    $0x5,%10;"
 163          "psrlq  $0x11,%%xmm2;"
 164          "xor    %k2,%12;"
 165          "psrlq  $0x13,%%xmm3;"
 166          "xor    %5,%10;"
 167          "and    %5,%12;"
 168          "psrld  $0xa,%%xmm8;"
 169          "ror    $0xb,%11;"
 170          "xor    %8,%11;"
 171          "xor    %k2,%12;"
 172          "ror    $0x6,%10;"
 173          "pxor   %%xmm3,%%xmm2;"
 174          "add    %10,%12;"
 175          "ror    $0x2,%11;"
 176          "add    8+%16,%12;"
 177          "pxor   %%xmm2,%%xmm8;"
 178          "mov    %8,%10;"
 179          "add    %12,%7;"
 180          "mov    %8,%12;"
 181          "pshufb %%xmm10,%%xmm8;"
 182          "or     %3,%10;"
 183          "add    %7,%4;"
 184          "and    %3,%12;"
 185          "paddd  %%xmm8,%%xmm0;"
 186          "and    %9,%10;"
 187          "add    %11,%7;"
 188          "pshufd $0x50,%%xmm0,%%xmm2;"
 189          "or     %12,%10;"
 190          "add    %10,%7;"
 191          "movdqa %%xmm2,%%xmm3;"
 192          "mov    %4,%10;"
 193          "ror    $0xe,%10;"
 194          "mov    %7,%11;"
 195          "movdqa %%xmm2,%%xmm4;"
 196          "ror    $0x9,%11;"
 197          "xor    %4,%10;"
 198          "mov    %5,%12;"
 199          "ror    $0x5,%10;"
 200          "psrlq  $0x11,%%xmm2;"
 201          "xor    %7,%11;"
 202          "xor    %6,%12;"
 203          "psrlq  $0x13,%%xmm3;"
 204          "xor    %4,%10;"
 205          "and    %4,%12;"
 206          "ror    $0xb,%11;"
 207          "psrld  $0xa,%%xmm4;"
 208          "xor    %7,%11;"
 209          "ror    $0x6,%10;"
 210          "xor    %6,%12;"
 211          "pxor   %%xmm3,%%xmm2;"
 212          "ror    $0x2,%11;"
 213          "add    %10,%12;"
 214          "add    12+%16,%12;"
 215          "pxor   %%xmm2,%%xmm4;"
 216          "mov    %7,%10;"
 217          "add    %12,%k2;"
 218          "mov    %7,%12;"
 219          "pshufb %%xmm11,%%xmm4;"
 220          "or     %9,%10;"
 221          "add    %k2,%3;"
 222          "and    %9,%12;"
 223          "paddd  %%xmm0,%%xmm4;"
 224          "and    %8,%10;"
 225          "add    %11,%k2;"
 226          "or     %12,%10;"
 227          "add    %10,%k2;"
 228          "movdqa 0x10(%13),%%xmm9;"
 229          "paddd  %%xmm5,%%xmm9;"
 230          "movdqa %%xmm9,%16;"
 231          "movdqa %%xmm4,%%xmm0;"
 232          "mov    %3,%10;"
 233          "ror    $0xe,%10;"
 234          "mov    %k2,%11;"
 235          "palignr $0x4,%%xmm7,%%xmm0;"
 236          "ror    $0x9,%11;"
 237          "xor    %3,%10;"
 238          "mov    %4,%12;"
 239          "ror    $0x5,%10;"
 240          "movdqa %%xmm6,%%xmm1;"
 241          "xor    %k2,%11;"
 242          "xor    %5,%12;"
 243          "paddd  %%xmm5,%%xmm0;"
 244          "xor    %3,%10;"
 245          "and    %3,%12;"
 246          "ror    $0xb,%11;"
 247          "palignr $0x4,%%xmm5,%%xmm1;"
 248          "xor    %k2,%11;"
 249          "ror    $0x6,%10;"
 250          "xor    %5,%12;"
 251          "movdqa %%xmm1,%%xmm2;"
 252          "ror    $0x2,%11;"
 253          "add    %10,%12;"
 254          "add    %16,%12;"
 255          "movdqa %%xmm1,%%xmm3;"
 256          "mov    %k2,%10;"
 257          "add    %12,%6;"
 258          "mov    %k2,%12;"
 259          "pslld  $0x19,%%xmm1;"
 260          "or     %8,%10;"
 261          "add    %6,%9;"
 262          "and    %8,%12;"
 263          "psrld  $0x7,%%xmm2;"
 264          "and    %7,%10;"
 265          "add    %11,%6;"
 266          "por    %%xmm2,%%xmm1;"
 267          "or     %12,%10;"
 268          "add    %10,%6;"
 269          "movdqa %%xmm3,%%xmm2;"
 270          "mov    %9,%10;"
 271          "mov    %6,%11;"
 272          "movdqa %%xmm3,%%xmm8;"
 273          "ror    $0xe,%10;"
 274          "xor    %9,%10;"
 275          "mov    %3,%12;"
 276          "ror    $0x9,%11;"
 277          "pslld  $0xe,%%xmm3;"
 278          "xor    %6,%11;"
 279          "ror    $0x5,%10;"
 280          "xor    %4,%12;"
 281          "psrld  $0x12,%%xmm2;"
 282          "ror    $0xb,%11;"
 283          "xor    %9,%10;"
 284          "and    %9,%12;"
 285          "ror    $0x6,%10;"
 286          "pxor   %%xmm3,%%xmm1;"
 287          "xor    %6,%11;"
 288          "xor    %4,%12;"
 289          "psrld  $0x3,%%xmm8;"
 290          "add    %10,%12;"
 291          "add    4+%16,%12;"
 292          "ror    $0x2,%11;"
 293          "pxor   %%xmm2,%%xmm1;"
 294          "mov    %6,%10;"
 295          "add    %12,%5;"
 296          "mov    %6,%12;"
 297          "pxor   %%xmm8,%%xmm1;"
 298          "or     %7,%10;"
 299          "add    %5,%8;"
 300          "and    %7,%12;"
 301          "pshufd $0xfa,%%xmm4,%%xmm2;"
 302          "and    %k2,%10;"
 303          "add    %11,%5;"
 304          "paddd  %%xmm1,%%xmm0;"
 305          "or     %12,%10;"
 306          "add    %10,%5;"
 307          "movdqa %%xmm2,%%xmm3;"
 308          "mov    %8,%10;"
 309          "mov    %5,%11;"
 310          "ror    $0xe,%10;"
 311          "movdqa %%xmm2,%%xmm8;"
 312          "xor    %8,%10;"
 313          "ror    $0x9,%11;"
 314          "mov    %9,%12;"
 315          "xor    %5,%11;"
 316          "ror    $0x5,%10;"
 317          "psrlq  $0x11,%%xmm2;"
 318          "xor    %3,%12;"
 319          "psrlq  $0x13,%%xmm3;"
 320          "xor    %8,%10;"
 321          "and    %8,%12;"
 322          "psrld  $0xa,%%xmm8;"
 323          "ror    $0xb,%11;"
 324          "xor    %5,%11;"
 325          "xor    %3,%12;"
 326          "ror    $0x6,%10;"
 327          "pxor   %%xmm3,%%xmm2;"
 328          "add    %10,%12;"
 329          "ror    $0x2,%11;"
 330          "add    8+%16,%12;"
 331          "pxor   %%xmm2,%%xmm8;"
 332          "mov    %5,%10;"
 333          "add    %12,%4;"
 334          "mov    %5,%12;"
 335          "pshufb %%xmm10,%%xmm8;"
 336          "or     %k2,%10;"
 337          "add    %4,%7;"
 338          "and    %k2,%12;"
 339          "paddd  %%xmm8,%%xmm0;"
 340          "and    %6,%10;"
 341          "add    %11,%4;"
 342          "pshufd $0x50,%%xmm0,%%xmm2;"
 343          "or     %12,%10;"
 344          "add    %10,%4;"
 345          "movdqa %%xmm2,%%xmm3;"
 346          "mov    %7,%10;"
 347          "ror    $0xe,%10;"
 348          "mov    %4,%11;"
 349          "movdqa %%xmm2,%%xmm5;"
 350          "ror    $0x9,%11;"
 351          "xor    %7,%10;"
 352          "mov    %8,%12;"
 353          "ror    $0x5,%10;"
 354          "psrlq  $0x11,%%xmm2;"
 355          "xor    %4,%11;"
 356          "xor    %9,%12;"
 357          "psrlq  $0x13,%%xmm3;"
 358          "xor    %7,%10;"
 359          "and    %7,%12;"
 360          "ror    $0xb,%11;"
 361          "psrld  $0xa,%%xmm5;"
 362          "xor    %4,%11;"
 363          "ror    $0x6,%10;"
 364          "xor    %9,%12;"
 365          "pxor   %%xmm3,%%xmm2;"
 366          "ror    $0x2,%11;"
 367          "add    %10,%12;"
 368          "add    12+%16,%12;"
 369          "pxor   %%xmm2,%%xmm5;"
 370          "mov    %4,%10;"
 371          "add    %12,%3;"
 372          "mov    %4,%12;"
 373          "pshufb %%xmm11,%%xmm5;"
 374          "or     %6,%10;"
 375          "add    %3,%k2;"
 376          "and    %6,%12;"
 377          "paddd  %%xmm0,%%xmm5;"
 378          "and    %5,%10;"
 379          "add    %11,%3;"
 380          "or     %12,%10;"
 381          "add    %10,%3;"
 382          "movdqa 0x20(%13),%%xmm9;"
 383          "paddd  %%xmm6,%%xmm9;"
 384          "movdqa %%xmm9,%16;"
 385          "movdqa %%xmm5,%%xmm0;"
 386          "mov    %k2,%10;"
 387          "ror    $0xe,%10;"
 388          "mov    %3,%11;"
 389          "palignr $0x4,%%xmm4,%%xmm0;"
 390          "ror    $0x9,%11;"
 391          "xor    %k2,%10;"
 392          "mov    %7,%12;"
 393          "ror    $0x5,%10;"
 394          "movdqa %%xmm7,%%xmm1;"
 395          "xor    %3,%11;"
 396          "xor    %8,%12;"
 397          "paddd  %%xmm6,%%xmm0;"
 398          "xor    %k2,%10;"
 399          "and    %k2,%12;"
 400          "ror    $0xb,%11;"
 401          "palignr $0x4,%%xmm6,%%xmm1;"
 402          "xor    %3,%11;"
 403          "ror    $0x6,%10;"
 404          "xor    %8,%12;"
 405          "movdqa %%xmm1,%%xmm2;"
 406          "ror    $0x2,%11;"
 407          "add    %10,%12;"
 408          "add    %16,%12;"
 409          "movdqa %%xmm1,%%xmm3;"
 410          "mov    %3,%10;"
 411          "add    %12,%9;"
 412          "mov    %3,%12;"
 413          "pslld  $0x19,%%xmm1;"
 414          "or     %5,%10;"
 415          "add    %9,%6;"
 416          "and    %5,%12;"
 417          "psrld  $0x7,%%xmm2;"
 418          "and    %4,%10;"
 419          "add    %11,%9;"
 420          "por    %%xmm2,%%xmm1;"
 421          "or     %12,%10;"
 422          "add    %10,%9;"
 423          "movdqa %%xmm3,%%xmm2;"
 424          "mov    %6,%10;"
 425          "mov    %9,%11;"
 426          "movdqa %%xmm3,%%xmm8;"
 427          "ror    $0xe,%10;"
 428          "xor    %6,%10;"
 429          "mov    %k2,%12;"
 430          "ror    $0x9,%11;"
 431          "pslld  $0xe,%%xmm3;"
 432          "xor    %9,%11;"
 433          "ror    $0x5,%10;"
 434          "xor    %7,%12;"
 435          "psrld  $0x12,%%xmm2;"
 436          "ror    $0xb,%11;"
 437          "xor    %6,%10;"
 438          "and    %6,%12;"
 439          "ror    $0x6,%10;"
 440          "pxor   %%xmm3,%%xmm1;"
 441          "xor    %9,%11;"
 442          "xor    %7,%12;"
 443          "psrld  $0x3,%%xmm8;"
 444          "add    %10,%12;"
 445          "add    4+%16,%12;"
 446          "ror    $0x2,%11;"
 447          "pxor   %%xmm2,%%xmm1;"
 448          "mov    %9,%10;"
 449          "add    %12,%8;"
 450          "mov    %9,%12;"
 451          "pxor   %%xmm8,%%xmm1;"
 452          "or     %4,%10;"
 453          "add    %8,%5;"
 454          "and    %4,%12;"
 455          "pshufd $0xfa,%%xmm5,%%xmm2;"
 456          "and    %3,%10;"
 457          "add    %11,%8;"
 458          "paddd  %%xmm1,%%xmm0;"
 459          "or     %12,%10;"
 460          "add    %10,%8;"
 461          "movdqa %%xmm2,%%xmm3;"
 462          "mov    %5,%10;"
 463          "mov    %8,%11;"
 464          "ror    $0xe,%10;"
 465          "movdqa %%xmm2,%%xmm8;"
 466          "xor    %5,%10;"
 467          "ror    $0x9,%11;"
 468          "mov    %6,%12;"
 469          "xor    %8,%11;"
 470          "ror    $0x5,%10;"
 471          "psrlq  $0x11,%%xmm2;"
 472          "xor    %k2,%12;"
 473          "psrlq  $0x13,%%xmm3;"
 474          "xor    %5,%10;"
 475          "and    %5,%12;"
 476          "psrld  $0xa,%%xmm8;"
 477          "ror    $0xb,%11;"
 478          "xor    %8,%11;"
 479          "xor    %k2,%12;"
 480          "ror    $0x6,%10;"
 481          "pxor   %%xmm3,%%xmm2;"
 482          "add    %10,%12;"
 483          "ror    $0x2,%11;"
 484          "add    8+%16,%12;"
 485          "pxor   %%xmm2,%%xmm8;"
 486          "mov    %8,%10;"
 487          "add    %12,%7;"
 488          "mov    %8,%12;"
 489          "pshufb %%xmm10,%%xmm8;"
 490          "or     %3,%10;"
 491          "add    %7,%4;"
 492          "and    %3,%12;"
 493          "paddd  %%xmm8,%%xmm0;"
 494          "and    %9,%10;"
 495          "add    %11,%7;"
 496          "pshufd $0x50,%%xmm0,%%xmm2;"
 497          "or     %12,%10;"
 498          "add    %10,%7;"
 499          "movdqa %%xmm2,%%xmm3;"
 500          "mov    %4,%10;"
 501          "ror    $0xe,%10;"
 502          "mov    %7,%11;"
 503          "movdqa %%xmm2,%%xmm6;"
 504          "ror    $0x9,%11;"
 505          "xor    %4,%10;"
 506          "mov    %5,%12;"
 507          "ror    $0x5,%10;"
 508          "psrlq  $0x11,%%xmm2;"
 509          "xor    %7,%11;"
 510          "xor    %6,%12;"
 511          "psrlq  $0x13,%%xmm3;"
 512          "xor    %4,%10;"
 513          "and    %4,%12;"
 514          "ror    $0xb,%11;"
 515          "psrld  $0xa,%%xmm6;"
 516          "xor    %7,%11;"
 517          "ror    $0x6,%10;"
 518          "xor    %6,%12;"
 519          "pxor   %%xmm3,%%xmm2;"
 520          "ror    $0x2,%11;"
 521          "add    %10,%12;"
 522          "add    12+%16,%12;"
 523          "pxor   %%xmm2,%%xmm6;"
 524          "mov    %7,%10;"
 525          "add    %12,%k2;"
 526          "mov    %7,%12;"
 527          "pshufb %%xmm11,%%xmm6;"
 528          "or     %9,%10;"
 529          "add    %k2,%3;"
 530          "and    %9,%12;"
 531          "paddd  %%xmm0,%%xmm6;"
 532          "and    %8,%10;"
 533          "add    %11,%k2;"
 534          "or     %12,%10;"
 535          "add    %10,%k2;"
 536          "movdqa 0x30(%13),%%xmm9;"
 537          "paddd  %%xmm7,%%xmm9;"
 538          "movdqa %%xmm9,%16;"
 539          "add    $0x40,%13;"
 540          "movdqa %%xmm6,%%xmm0;"
 541          "mov    %3,%10;"
 542          "ror    $0xe,%10;"
 543          "mov    %k2,%11;"
 544          "palignr $0x4,%%xmm5,%%xmm0;"
 545          "ror    $0x9,%11;"
 546          "xor    %3,%10;"
 547          "mov    %4,%12;"
 548          "ror    $0x5,%10;"
 549          "movdqa %%xmm4,%%xmm1;"
 550          "xor    %k2,%11;"
 551          "xor    %5,%12;"
 552          "paddd  %%xmm7,%%xmm0;"
 553          "xor    %3,%10;"
 554          "and    %3,%12;"
 555          "ror    $0xb,%11;"
 556          "palignr $0x4,%%xmm7,%%xmm1;"
 557          "xor    %k2,%11;"
 558          "ror    $0x6,%10;"
 559          "xor    %5,%12;"
 560          "movdqa %%xmm1,%%xmm2;"
 561          "ror    $0x2,%11;"
 562          "add    %10,%12;"
 563          "add    %16,%12;"
 564          "movdqa %%xmm1,%%xmm3;"
 565          "mov    %k2,%10;"
 566          "add    %12,%6;"
 567          "mov    %k2,%12;"
 568          "pslld  $0x19,%%xmm1;"
 569          "or     %8,%10;"
 570          "add    %6,%9;"
 571          "and    %8,%12;"
 572          "psrld  $0x7,%%xmm2;"
 573          "and    %7,%10;"
 574          "add    %11,%6;"
 575          "por    %%xmm2,%%xmm1;"
 576          "or     %12,%10;"
 577          "add    %10,%6;"
 578          "movdqa %%xmm3,%%xmm2;"
 579          "mov    %9,%10;"
 580          "mov    %6,%11;"
 581          "movdqa %%xmm3,%%xmm8;"
 582          "ror    $0xe,%10;"
 583          "xor    %9,%10;"
 584          "mov    %3,%12;"
 585          "ror    $0x9,%11;"
 586          "pslld  $0xe,%%xmm3;"
 587          "xor    %6,%11;"
 588          "ror    $0x5,%10;"
 589          "xor    %4,%12;"
 590          "psrld  $0x12,%%xmm2;"
 591          "ror    $0xb,%11;"
 592          "xor    %9,%10;"
 593          "and    %9,%12;"
 594          "ror    $0x6,%10;"
 595          "pxor   %%xmm3,%%xmm1;"
 596          "xor    %6,%11;"
 597          "xor    %4,%12;"
 598          "psrld  $0x3,%%xmm8;"
 599          "add    %10,%12;"
 600          "add    4+%16,%12;"
 601          "ror    $0x2,%11;"
 602          "pxor   %%xmm2,%%xmm1;"
 603          "mov    %6,%10;"
 604          "add    %12,%5;"
 605          "mov    %6,%12;"
 606          "pxor   %%xmm8,%%xmm1;"
 607          "or     %7,%10;"
 608          "add    %5,%8;"
 609          "and    %7,%12;"
 610          "pshufd $0xfa,%%xmm6,%%xmm2;"
 611          "and    %k2,%10;"
 612          "add    %11,%5;"
 613          "paddd  %%xmm1,%%xmm0;"
 614          "or     %12,%10;"
 615          "add    %10,%5;"
 616          "movdqa %%xmm2,%%xmm3;"
 617          "mov    %8,%10;"
 618          "mov    %5,%11;"
 619          "ror    $0xe,%10;"
 620          "movdqa %%xmm2,%%xmm8;"
 621          "xor    %8,%10;"
 622          "ror    $0x9,%11;"
 623          "mov    %9,%12;"
 624          "xor    %5,%11;"
 625          "ror    $0x5,%10;"
 626          "psrlq  $0x11,%%xmm2;"
 627          "xor    %3,%12;"
 628          "psrlq  $0x13,%%xmm3;"
 629          "xor    %8,%10;"
 630          "and    %8,%12;"
 631          "psrld  $0xa,%%xmm8;"
 632          "ror    $0xb,%11;"
 633          "xor    %5,%11;"
 634          "xor    %3,%12;"
 635          "ror    $0x6,%10;"
 636          "pxor   %%xmm3,%%xmm2;"
 637          "add    %10,%12;"
 638          "ror    $0x2,%11;"
 639          "add    8+%16,%12;"
 640          "pxor   %%xmm2,%%xmm8;"
 641          "mov    %5,%10;"
 642          "add    %12,%4;"
 643          "mov    %5,%12;"
 644          "pshufb %%xmm10,%%xmm8;"
 645          "or     %k2,%10;"
 646          "add    %4,%7;"
 647          "and    %k2,%12;"
 648          "paddd  %%xmm8,%%xmm0;"
 649          "and    %6,%10;"
 650          "add    %11,%4;"
 651          "pshufd $0x50,%%xmm0,%%xmm2;"
 652          "or     %12,%10;"
 653          "add    %10,%4;"
 654          "movdqa %%xmm2,%%xmm3;"
 655          "mov    %7,%10;"
 656          "ror    $0xe,%10;"
 657          "mov    %4,%11;"
 658          "movdqa %%xmm2,%%xmm7;"
 659          "ror    $0x9,%11;"
 660          "xor    %7,%10;"
 661          "mov    %8,%12;"
 662          "ror    $0x5,%10;"
 663          "psrlq  $0x11,%%xmm2;"
 664          "xor    %4,%11;"
 665          "xor    %9,%12;"
 666          "psrlq  $0x13,%%xmm3;"
 667          "xor    %7,%10;"
 668          "and    %7,%12;"
 669          "ror    $0xb,%11;"
 670          "psrld  $0xa,%%xmm7;"
 671          "xor    %4,%11;"
 672          "ror    $0x6,%10;"
 673          "xor    %9,%12;"
 674          "pxor   %%xmm3,%%xmm2;"
 675          "ror    $0x2,%11;"
 676          "add    %10,%12;"
 677          "add    12+%16,%12;"
 678          "pxor   %%xmm2,%%xmm7;"
 679          "mov    %4,%10;"
 680          "add    %12,%3;"
 681          "mov    %4,%12;"
 682          "pshufb %%xmm11,%%xmm7;"
 683          "or     %6,%10;"
 684          "add    %3,%k2;"
 685          "and    %6,%12;"
 686          "paddd  %%xmm0,%%xmm7;"
 687          "and    %5,%10;"
 688          "add    %11,%3;"
 689          "or     %12,%10;"
 690          "add    %10,%3;"
 691          "sub    $0x1,%1;"
 692          "jne    Lloop1_%=;"
 693          "mov    $0x2,%1;"
 694  
 695          "Lloop2_%=:"
 696          "paddd  0x0(%13),%%xmm4;"
 697          "movdqa %%xmm4,%16;"
 698          "mov    %k2,%10;"
 699          "ror    $0xe,%10;"
 700          "mov    %3,%11;"
 701          "xor    %k2,%10;"
 702          "ror    $0x9,%11;"
 703          "mov    %7,%12;"
 704          "xor    %3,%11;"
 705          "ror    $0x5,%10;"
 706          "xor    %8,%12;"
 707          "xor    %k2,%10;"
 708          "ror    $0xb,%11;"
 709          "and    %k2,%12;"
 710          "xor    %3,%11;"
 711          "ror    $0x6,%10;"
 712          "xor    %8,%12;"
 713          "add    %10,%12;"
 714          "ror    $0x2,%11;"
 715          "add    %16,%12;"
 716          "mov    %3,%10;"
 717          "add    %12,%9;"
 718          "mov    %3,%12;"
 719          "or     %5,%10;"
 720          "add    %9,%6;"
 721          "and    %5,%12;"
 722          "and    %4,%10;"
 723          "add    %11,%9;"
 724          "or     %12,%10;"
 725          "add    %10,%9;"
 726          "mov    %6,%10;"
 727          "ror    $0xe,%10;"
 728          "mov    %9,%11;"
 729          "xor    %6,%10;"
 730          "ror    $0x9,%11;"
 731          "mov    %k2,%12;"
 732          "xor    %9,%11;"
 733          "ror    $0x5,%10;"
 734          "xor    %7,%12;"
 735          "xor    %6,%10;"
 736          "ror    $0xb,%11;"
 737          "and    %6,%12;"
 738          "xor    %9,%11;"
 739          "ror    $0x6,%10;"
 740          "xor    %7,%12;"
 741          "add    %10,%12;"
 742          "ror    $0x2,%11;"
 743          "add    4+%16,%12;"
 744          "mov    %9,%10;"
 745          "add    %12,%8;"
 746          "mov    %9,%12;"
 747          "or     %4,%10;"
 748          "add    %8,%5;"
 749          "and    %4,%12;"
 750          "and    %3,%10;"
 751          "add    %11,%8;"
 752          "or     %12,%10;"
 753          "add    %10,%8;"
 754          "mov    %5,%10;"
 755          "ror    $0xe,%10;"
 756          "mov    %8,%11;"
 757          "xor    %5,%10;"
 758          "ror    $0x9,%11;"
 759          "mov    %6,%12;"
 760          "xor    %8,%11;"
 761          "ror    $0x5,%10;"
 762          "xor    %k2,%12;"
 763          "xor    %5,%10;"
 764          "ror    $0xb,%11;"
 765          "and    %5,%12;"
 766          "xor    %8,%11;"
 767          "ror    $0x6,%10;"
 768          "xor    %k2,%12;"
 769          "add    %10,%12;"
 770          "ror    $0x2,%11;"
 771          "add    8+%16,%12;"
 772          "mov    %8,%10;"
 773          "add    %12,%7;"
 774          "mov    %8,%12;"
 775          "or     %3,%10;"
 776          "add    %7,%4;"
 777          "and    %3,%12;"
 778          "and    %9,%10;"
 779          "add    %11,%7;"
 780          "or     %12,%10;"
 781          "add    %10,%7;"
 782          "mov    %4,%10;"
 783          "ror    $0xe,%10;"
 784          "mov    %7,%11;"
 785          "xor    %4,%10;"
 786          "ror    $0x9,%11;"
 787          "mov    %5,%12;"
 788          "xor    %7,%11;"
 789          "ror    $0x5,%10;"
 790          "xor    %6,%12;"
 791          "xor    %4,%10;"
 792          "ror    $0xb,%11;"
 793          "and    %4,%12;"
 794          "xor    %7,%11;"
 795          "ror    $0x6,%10;"
 796          "xor    %6,%12;"
 797          "add    %10,%12;"
 798          "ror    $0x2,%11;"
 799          "add    12+%16,%12;"
 800          "mov    %7,%10;"
 801          "add    %12,%k2;"
 802          "mov    %7,%12;"
 803          "or     %9,%10;"
 804          "add    %k2,%3;"
 805          "and    %9,%12;"
 806          "and    %8,%10;"
 807          "add    %11,%k2;"
 808          "or     %12,%10;"
 809          "add    %10,%k2;"
 810          "paddd  0x10(%13),%%xmm5;"
 811          "movdqa %%xmm5,%16;"
 812          "add    $0x20,%13;"
 813          "mov    %3,%10;"
 814          "ror    $0xe,%10;"
 815          "mov    %k2,%11;"
 816          "xor    %3,%10;"
 817          "ror    $0x9,%11;"
 818          "mov    %4,%12;"
 819          "xor    %k2,%11;"
 820          "ror    $0x5,%10;"
 821          "xor    %5,%12;"
 822          "xor    %3,%10;"
 823          "ror    $0xb,%11;"
 824          "and    %3,%12;"
 825          "xor    %k2,%11;"
 826          "ror    $0x6,%10;"
 827          "xor    %5,%12;"
 828          "add    %10,%12;"
 829          "ror    $0x2,%11;"
 830          "add    %16,%12;"
 831          "mov    %k2,%10;"
 832          "add    %12,%6;"
 833          "mov    %k2,%12;"
 834          "or     %8,%10;"
 835          "add    %6,%9;"
 836          "and    %8,%12;"
 837          "and    %7,%10;"
 838          "add    %11,%6;"
 839          "or     %12,%10;"
 840          "add    %10,%6;"
 841          "mov    %9,%10;"
 842          "ror    $0xe,%10;"
 843          "mov    %6,%11;"
 844          "xor    %9,%10;"
 845          "ror    $0x9,%11;"
 846          "mov    %3,%12;"
 847          "xor    %6,%11;"
 848          "ror    $0x5,%10;"
 849          "xor    %4,%12;"
 850          "xor    %9,%10;"
 851          "ror    $0xb,%11;"
 852          "and    %9,%12;"
 853          "xor    %6,%11;"
 854          "ror    $0x6,%10;"
 855          "xor    %4,%12;"
 856          "add    %10,%12;"
 857          "ror    $0x2,%11;"
 858          "add    4+%16,%12;"
 859          "mov    %6,%10;"
 860          "add    %12,%5;"
 861          "mov    %6,%12;"
 862          "or     %7,%10;"
 863          "add    %5,%8;"
 864          "and    %7,%12;"
 865          "and    %k2,%10;"
 866          "add    %11,%5;"
 867          "or     %12,%10;"
 868          "add    %10,%5;"
 869          "mov    %8,%10;"
 870          "ror    $0xe,%10;"
 871          "mov    %5,%11;"
 872          "xor    %8,%10;"
 873          "ror    $0x9,%11;"
 874          "mov    %9,%12;"
 875          "xor    %5,%11;"
 876          "ror    $0x5,%10;"
 877          "xor    %3,%12;"
 878          "xor    %8,%10;"
 879          "ror    $0xb,%11;"
 880          "and    %8,%12;"
 881          "xor    %5,%11;"
 882          "ror    $0x6,%10;"
 883          "xor    %3,%12;"
 884          "add    %10,%12;"
 885          "ror    $0x2,%11;"
 886          "add    8+%16,%12;"
 887          "mov    %5,%10;"
 888          "add    %12,%4;"
 889          "mov    %5,%12;"
 890          "or     %k2,%10;"
 891          "add    %4,%7;"
 892          "and    %k2,%12;"
 893          "and    %6,%10;"
 894          "add    %11,%4;"
 895          "or     %12,%10;"
 896          "add    %10,%4;"
 897          "mov    %7,%10;"
 898          "ror    $0xe,%10;"
 899          "mov    %4,%11;"
 900          "xor    %7,%10;"
 901          "ror    $0x9,%11;"
 902          "mov    %8,%12;"
 903          "xor    %4,%11;"
 904          "ror    $0x5,%10;"
 905          "xor    %9,%12;"
 906          "xor    %7,%10;"
 907          "ror    $0xb,%11;"
 908          "and    %7,%12;"
 909          "xor    %4,%11;"
 910          "ror    $0x6,%10;"
 911          "xor    %9,%12;"
 912          "add    %10,%12;"
 913          "ror    $0x2,%11;"
 914          "add    12+%16,%12;"
 915          "mov    %4,%10;"
 916          "add    %12,%3;"
 917          "mov    %4,%12;"
 918          "or     %6,%10;"
 919          "add    %3,%k2;"
 920          "and    %6,%12;"
 921          "and    %5,%10;"
 922          "add    %11,%3;"
 923          "or     %12,%10;"
 924          "add    %10,%3;"
 925          "movdqa %%xmm6,%%xmm4;"
 926          "movdqa %%xmm7,%%xmm5;"
 927          "sub    $0x1,%1;"
 928          "jne    Lloop2_%=;"
 929          "add    (%0),%3;"
 930          "mov    %3,(%0);"
 931          "add    0x4(%0),%4;"
 932          "mov    %4,0x4(%0);"
 933          "add    0x8(%0),%5;"
 934          "mov    %5,0x8(%0);"
 935          "add    0xc(%0),%6;"
 936          "mov    %6,0xc(%0);"
 937          "add    0x10(%0),%k2;"
 938          "mov    %k2,0x10(%0);"
 939          "add    0x14(%0),%7;"
 940          "mov    %7,0x14(%0);"
 941          "add    0x18(%0),%8;"
 942          "mov    %8,0x18(%0);"
 943          "add    0x1c(%0),%9;"
 944          "mov    %9,0x1c(%0);"
 945          "mov    %15,%1;"
 946          "add    $0x40,%1;"
 947          "cmp    %14,%1;"
 948          "jne    Lloop0_%=;"
 949  
 950          "Ldone_hash_%=:"
 951  
 952          : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
 953          : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
 954          : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
 955     );
 956  }
 957  }
 958  
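// For reference while reading the scheduled assembly above, a minimal
// portable sketch of the per-round and message-schedule math it implements
// (standard FIPS 180-4 definitions; this is an illustration, not part of the
// build; note the asm computes Maj in the equivalent ((a|c)&b)|(a&c) form):
//
//   #include <cstdint>
//
//   static inline uint32_t Rotr(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
//   static inline uint32_t Sigma0(uint32_t x) { return Rotr(x, 2) ^ Rotr(x, 13) ^ Rotr(x, 22); }
//   static inline uint32_t Sigma1(uint32_t x) { return Rotr(x, 6) ^ Rotr(x, 11) ^ Rotr(x, 25); }
//   static inline uint32_t sigma0(uint32_t x) { return Rotr(x, 7) ^ Rotr(x, 18) ^ (x >> 3); }
//   static inline uint32_t sigma1(uint32_t x) { return Rotr(x, 17) ^ Rotr(x, 19) ^ (x >> 10); }
//
//   // One round: t1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]; t2 = Sigma0(a) + Maj(a,b,c)
//   static inline void Round(uint32_t a, uint32_t b, uint32_t c, uint32_t& d,
//                            uint32_t e, uint32_t f, uint32_t g, uint32_t& h,
//                            uint32_t k, uint32_t w)
//   {
//       uint32_t t1 = h + Sigma1(e) + ((e & f) ^ (~e & g)) + k + w;
//       uint32_t t2 = Sigma0(a) + ((a & b) ^ (a & c) ^ (b & c));
//       d += t1;
//       h = t1 + t2;
//   }
//
//   // Message schedule: W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]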
 959  /*
 960  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 961  ; Copyright (c) 2012, Intel Corporation 
 962  ; 
 963  ; All rights reserved. 
 964  ; 
 965  ; Redistribution and use in source and binary forms, with or without
 966  ; modification, are permitted provided that the following conditions are
 967  ; met: 
 968  ; 
 969  ; * Redistributions of source code must retain the above copyright
 970  ;   notice, this list of conditions and the following disclaimer.  
 971  ; 
 972  ; * Redistributions in binary form must reproduce the above copyright
 973  ;   notice, this list of conditions and the following disclaimer in the
 974  ;   documentation and/or other materials provided with the
 975  ;   distribution. 
 976  ; 
 977  ; * Neither the name of the Intel Corporation nor the names of its
 978  ;   contributors may be used to endorse or promote products derived from
 979  ;   this software without specific prior written permission. 
 980  ; 
 981  ; 
 982  ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 983  ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 984  ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 985  ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 986  ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 987  ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 988  ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 989  ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 990  ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 991  ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 992  ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 993  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 994  ;
 995  ; Example YASM command lines:
 996  ; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
 997  ; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
 998  ;
 999  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1000  ;
1001  ; This code is described in an Intel White-Paper:
1002  ; "Fast SHA-256 Implementations on Intel Architecture Processors"
1003  ;
1004  ; To find it, surf to https://www.intel.com/p/en_US/embedded
1005  ; and search for that title.
1006  ; The paper is expected to be released roughly at the end of April, 2012
1007  ;
1008  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 1009  ; This code schedules 1 block at a time, with 4 lanes per block
1010  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1011  
1012  %define	MOVDQ movdqu ;; assume buffers not aligned 
1013  
1014  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1015  
1016  ; addm [mem], reg
1017  ; Add reg to mem using reg-mem add and store
1018  %macro addm 2
1019      add	%2, %1
1020      mov	%1, %2
1021  %endm
1022  
1023  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1024  
1025  ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1026  ; Load xmm with mem and byte swap each dword
1027  %macro COPY_XMM_AND_BSWAP 3
1028      MOVDQ %1, %2
1029      pshufb %1, %3
1030  %endmacro
1031  
1032  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1033  
1034  %define X0 xmm4
1035  %define X1 xmm5
1036  %define X2 xmm6
1037  %define X3 xmm7
1038  
1039  %define XTMP0 xmm0
1040  %define XTMP1 xmm1
1041  %define XTMP2 xmm2
1042  %define XTMP3 xmm3
1043  %define XTMP4 xmm8
1044  %define XFER  xmm9
1045  
1046  %define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
1047  %define SHUF_DC00	xmm11 ; shuffle xDxC -> DC00
1048  %define BYTE_FLIP_MASK	xmm12
1049      
1050  %ifdef LINUX
1051  %define NUM_BLKS rdx	; 3rd arg
1052  %define CTX	rsi	; 2nd arg
1053  %define INP	rdi	; 1st arg
1054  
1055  %define SRND	rdi	; clobbers INP
1056  %define c	ecx
1057  %define d 	r8d
1058  %define e 	edx
1059  %else
1060  %define NUM_BLKS r8	; 3rd arg
1061  %define CTX	rdx 	; 2nd arg
1062  %define INP	rcx 	; 1st arg
1063  
1064  %define SRND	rcx	; clobbers INP
1065  %define c 	edi 
1066  %define d	esi 
1067  %define e 	r8d
1068      
1069  %endif
1070  %define TBL	rbp
1071  %define a eax
1072  %define b ebx
1073  
1074  %define f r9d
1075  %define g r10d
1076  %define h r11d
1077  
1078  %define y0 r13d
1079  %define y1 r14d
1080  %define y2 r15d
1081  
1082  
1083  
1084  _INP_END_SIZE	equ 8
1085  _INP_SIZE	equ 8
1086  _XFER_SIZE	equ 8
1087  %ifdef LINUX
1088  _XMM_SAVE_SIZE	equ 0
1089  %else
1090  _XMM_SAVE_SIZE	equ 7*16
1091  %endif
1092  ; STACK_SIZE plus pushes must be an odd multiple of 8
1093  _ALIGN_SIZE	equ 8
1094  
1095  _INP_END	equ 0
1096  _INP		equ _INP_END  + _INP_END_SIZE
1097  _XFER		equ _INP      + _INP_SIZE
1098  _XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
1099  STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
1100  
1101  ; rotate_Xs
1102  ; Rotate values of symbols X0...X3
1103  %macro rotate_Xs 0
1104  %xdefine X_ X0
1105  %xdefine X0 X1
1106  %xdefine X1 X2
1107  %xdefine X2 X3
1108  %xdefine X3 X_
1109  %endm
1110  
1111  ; ROTATE_ARGS
1112  ; Rotate values of symbols a...h
1113  %macro ROTATE_ARGS 0
1114  %xdefine TMP_ h
1115  %xdefine h g
1116  %xdefine g f
1117  %xdefine f e
1118  %xdefine e d
1119  %xdefine d c
1120  %xdefine c b
1121  %xdefine b a
1122  %xdefine a TMP_
1123  %endm
1124  
1125  %macro FOUR_ROUNDS_AND_SCHED 0
1126  	;; compute s0 four at a time and s1 two at a time
1127  	;; compute W[-16] + W[-7] 4 at a time
1128  	movdqa	XTMP0, X3
1129      mov	y0, e		; y0 = e
1130      ror	y0, (25-11)	; y0 = e >> (25-11)
1131      mov	y1, a		; y1 = a
1132  	palignr	XTMP0, X2, 4	; XTMP0 = W[-7]
1133      ror	y1, (22-13)	; y1 = a >> (22-13)
1134      xor	y0, e		; y0 = e ^ (e >> (25-11))
1135      mov	y2, f		; y2 = f
1136      ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
1137  	movdqa	XTMP1, X1
1138      xor	y1, a		; y1 = a ^ (a >> (22-13)
1139      xor	y2, g		; y2 = f^g
1140  	paddd	XTMP0, X0	; XTMP0 = W[-7] + W[-16]
1141      xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1142      and	y2, e		; y2 = (f^g)&e
1143      ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
1144  	;; compute s0
1145  	palignr	XTMP1, X0, 4	; XTMP1 = W[-15]
1146      xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1147      ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1148      xor	y2, g		; y2 = CH = ((f^g)&e)^g
1149  	movdqa	XTMP2, XTMP1	; XTMP2 = W[-15]
1150      ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1151      add	y2, y0		; y2 = S1 + CH
1152      add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
1153  	movdqa	XTMP3, XTMP1	; XTMP3 = W[-15]
1154      mov	y0, a		; y0 = a
1155      add	h, y2		; h = h + S1 + CH + k + w
1156      mov	y2, a		; y2 = a
1157  	pslld	XTMP1, (32-7)
1158      or	y0, c		; y0 = a|c
1159      add	d, h		; d = d + h + S1 + CH + k + w
1160      and	y2, c		; y2 = a&c
1161  	psrld	XTMP2, 7
1162      and	y0, b		; y0 = (a|c)&b
1163      add	h, y1		; h = h + S1 + CH + k + w + S0
1164  	por	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7
1165      or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
1166      add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
1167  
1168  ROTATE_ARGS
1169  	movdqa	XTMP2, XTMP3	; XTMP2 = W[-15]
1170      mov	y0, e		; y0 = e
1171      mov	y1, a		; y1 = a
1172  	movdqa	XTMP4, XTMP3	; XTMP4 = W[-15]
1173      ror	y0, (25-11)	; y0 = e >> (25-11)
1174      xor	y0, e		; y0 = e ^ (e >> (25-11))
1175      mov	y2, f		; y2 = f
1176      ror	y1, (22-13)	; y1 = a >> (22-13)
1177  	pslld	XTMP3, (32-18)
1178      xor	y1, a		; y1 = a ^ (a >> (22-13)
1179      ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
1180      xor	y2, g		; y2 = f^g
1181  	psrld	XTMP2, 18
1182      ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
1183      xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1184      and	y2, e		; y2 = (f^g)&e
1185      ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1186  	pxor	XTMP1, XTMP3
1187      xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1188      xor	y2, g		; y2 = CH = ((f^g)&e)^g
1189  	psrld	XTMP4, 3	; XTMP4 = W[-15] >> 3
1190      add	y2, y0		; y2 = S1 + CH
1191      add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
1192      ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1193  	pxor	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1194      mov	y0, a		; y0 = a
1195      add	h, y2		; h = h + S1 + CH + k + w
1196      mov	y2, a		; y2 = a
1197  	pxor	XTMP1, XTMP4	; XTMP1 = s0
1198      or	y0, c		; y0 = a|c
1199      add	d, h		; d = d + h + S1 + CH + k + w
1200      and	y2, c		; y2 = a&c
1201  	;; compute low s1
1202  	pshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
1203      and	y0, b		; y0 = (a|c)&b
1204      add	h, y1		; h = h + S1 + CH + k + w + S0
1205  	paddd	XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
1206      or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
1207      add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
1208  
1209  ROTATE_ARGS
1210  	movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
1211      mov	y0, e		; y0 = e
1212      mov	y1, a		; y1 = a
1213      ror	y0, (25-11)	; y0 = e >> (25-11)
1214  	movdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
1215      xor	y0, e		; y0 = e ^ (e >> (25-11))
1216      ror	y1, (22-13)	; y1 = a >> (22-13)
1217      mov	y2, f		; y2 = f
1218      xor	y1, a		; y1 = a ^ (a >> (22-13)
1219      ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
1220  	psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xBxA}
1221      xor	y2, g		; y2 = f^g
1222  	psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xBxA}
1223      xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1224      and	y2, e		; y2 = (f^g)&e
1225  	psrld	XTMP4, 10	; XTMP4 = W[-2] >> 10 {BBAA}
1226      ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
1227      xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1228      xor	y2, g		; y2 = CH = ((f^g)&e)^g
1229      ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1230  	pxor	XTMP2, XTMP3
1231      add	y2, y0		; y2 = S1 + CH
1232      ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1233      add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
1234  	pxor	XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
1235      mov	y0, a		; y0 = a
1236      add	h, y2		; h = h + S1 + CH + k + w
1237      mov	y2, a		; y2 = a
1238  	pshufb	XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
1239      or	y0, c		; y0 = a|c
1240      add	d, h		; d = d + h + S1 + CH + k + w
1241      and	y2, c		; y2 = a&c
1242  	paddd	XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
1243      and	y0, b		; y0 = (a|c)&b
1244      add	h, y1		; h = h + S1 + CH + k + w + S0
1245  	;; compute high s1
1246  	pshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
1247      or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
1248      add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
1249  
1250  ROTATE_ARGS
1251  	movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
1252      mov	y0, e		; y0 = e
1253      ror	y0, (25-11)	; y0 = e >> (25-11)
1254      mov	y1, a		; y1 = a
1255  	movdqa	X0,    XTMP2	; X0    = W[-2] {DDCC}
1256      ror	y1, (22-13)	; y1 = a >> (22-13)
1257      xor	y0, e		; y0 = e ^ (e >> (25-11))
1258      mov	y2, f		; y2 = f
1259      ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
1260  	psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xDxC}
1261      xor	y1, a		; y1 = a ^ (a >> (22-13)
1262      xor	y2, g		; y2 = f^g
1263  	psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xDxC}
1264      xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1265      and	y2, e		; y2 = (f^g)&e
1266      ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
1267  	psrld	X0,    10	; X0 = W[-2] >> 10 {DDCC}
1268      xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1269      ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1270      xor	y2, g		; y2 = CH = ((f^g)&e)^g
1271  	pxor	XTMP2, XTMP3
1272      ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1273      add	y2, y0		; y2 = S1 + CH
1274      add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
1275  	pxor	X0, XTMP2	; X0 = s1 {xDxC}
1276      mov	y0, a		; y0 = a
1277      add	h, y2		; h = h + S1 + CH + k + w
1278      mov	y2, a		; y2 = a
1279  	pshufb	X0, SHUF_DC00	; X0 = s1 {DC00}
1280      or	y0, c		; y0 = a|c
1281      add	d, h		; d = d + h + S1 + CH + k + w
1282      and	y2, c		; y2 = a&c
1283  	paddd	X0, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
1284      and	y0, b		; y0 = (a|c)&b
1285      add	h, y1		; h = h + S1 + CH + k + w + S0
1286      or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
1287      add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
1288  
1289  ROTATE_ARGS
1290  rotate_Xs
1291  %endm
1292  
1293  ;; input is [rsp + _XFER + %1 * 4]
1294  %macro DO_ROUND 1
1295      mov	y0, e		; y0 = e
1296      ror	y0, (25-11)	; y0 = e >> (25-11)
1297      mov	y1, a		; y1 = a
1298      xor	y0, e		; y0 = e ^ (e >> (25-11))
1299      ror	y1, (22-13)	; y1 = a >> (22-13)
1300      mov	y2, f		; y2 = f
1301      xor	y1, a		; y1 = a ^ (a >> (22-13)
1302      ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
1303      xor	y2, g		; y2 = f^g
1304      xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1305      ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
1306      and	y2, e		; y2 = (f^g)&e
1307      xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1308      ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1309      xor	y2, g		; y2 = CH = ((f^g)&e)^g
1310      add	y2, y0		; y2 = S1 + CH
1311      ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1312      add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
1313      mov	y0, a		; y0 = a
1314      add	h, y2		; h = h + S1 + CH + k + w
1315      mov	y2, a		; y2 = a
1316      or	y0, c		; y0 = a|c
1317      add	d, h		; d = d + h + S1 + CH + k + w
1318      and	y2, c		; y2 = a&c
1319      and	y0, b		; y0 = (a|c)&b
1320      add	h, y1		; h = h + S1 + CH + k + w + S0
1321      or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
1322      add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
1323      ROTATE_ARGS
1324  %endm
1325  
1326  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1327  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1328  ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1329  ;; arg 1 : pointer to input data
1330  ;; arg 2 : pointer to digest
1331  ;; arg 3 : Num blocks
1332  section .text
1333  global sha256_sse4
1334  align 32
1335  sha256_sse4:
1336      push	rbx
1337  %ifndef LINUX
1338      push	rsi
1339      push	rdi
1340  %endif
1341      push	rbp
1342      push	r13
1343      push	r14
1344      push	r15
1345  
1346      sub	rsp,STACK_SIZE
1347  %ifndef LINUX
1348      movdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
1349      movdqa	[rsp + _XMM_SAVE + 1*16],xmm7
1350      movdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
1351      movdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
1352      movdqa	[rsp + _XMM_SAVE + 4*16],xmm10
1353      movdqa	[rsp + _XMM_SAVE + 5*16],xmm11
1354      movdqa	[rsp + _XMM_SAVE + 6*16],xmm12
1355  %endif
1356  
1357      shl	NUM_BLKS, 6	; convert to bytes
1358      jz	done_hash
1359      add	NUM_BLKS, INP	; pointer to end of data
1360      mov	[rsp + _INP_END], NUM_BLKS
1361  
1362      ;; load initial digest
1363      mov	a,[4*0 + CTX]
1364      mov	b,[4*1 + CTX]
1365      mov	c,[4*2 + CTX]
1366      mov	d,[4*3 + CTX]
1367      mov	e,[4*4 + CTX]
1368      mov	f,[4*5 + CTX]
1369      mov	g,[4*6 + CTX]
1370      mov	h,[4*7 + CTX]
1371  
1372      movdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1373      movdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
1374      movdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
1375  
1376  loop0:
1377      lea	TBL,[K256 wrt rip]
1378  
1379      ;; byte swap first 16 dwords
1380      COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
1381      COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
1382      COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
1383      COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
1384      
1385      mov	[rsp + _INP], INP
1386  
1387      ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1388      mov	SRND, 3
1389  align 16
1390  loop1:
1391      movdqa	XFER, [TBL + 0*16]
1392      paddd	XFER, X0
1393      movdqa	[rsp + _XFER], XFER
1394      FOUR_ROUNDS_AND_SCHED
1395  
1396      movdqa	XFER, [TBL + 1*16]
1397      paddd	XFER, X0
1398      movdqa	[rsp + _XFER], XFER
1399      FOUR_ROUNDS_AND_SCHED
1400  
1401      movdqa	XFER, [TBL + 2*16]
1402      paddd	XFER, X0
1403      movdqa	[rsp + _XFER], XFER
1404      FOUR_ROUNDS_AND_SCHED
1405  
1406      movdqa	XFER, [TBL + 3*16]
1407      paddd	XFER, X0
1408      movdqa	[rsp + _XFER], XFER
1409      add	TBL, 4*16
1410      FOUR_ROUNDS_AND_SCHED
1411  
1412      sub	SRND, 1
1413      jne	loop1
1414  
1415      mov	SRND, 2
1416  loop2:
1417      paddd	X0, [TBL + 0*16]
1418      movdqa	[rsp + _XFER], X0
1419      DO_ROUND	0
1420      DO_ROUND	1
1421      DO_ROUND	2
1422      DO_ROUND	3
1423      paddd	X1, [TBL + 1*16]
1424      movdqa	[rsp + _XFER], X1
1425      add	TBL, 2*16
1426      DO_ROUND	0
1427      DO_ROUND	1
1428      DO_ROUND	2
1429      DO_ROUND	3
1430  
1431      movdqa	X0, X2
1432      movdqa	X1, X3
1433  
1434      sub	SRND, 1
1435      jne	loop2
1436  
1437      addm	[4*0 + CTX],a
1438      addm	[4*1 + CTX],b
1439      addm	[4*2 + CTX],c
1440      addm	[4*3 + CTX],d
1441      addm	[4*4 + CTX],e
1442      addm	[4*5 + CTX],f
1443      addm	[4*6 + CTX],g
1444      addm	[4*7 + CTX],h
1445  
1446      mov	INP, [rsp + _INP]
1447      add	INP, 64
1448      cmp	INP, [rsp + _INP_END]
1449      jne	loop0
1450  
1451  done_hash:
1452  %ifndef LINUX
1453      movdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
1454      movdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
1455      movdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
1456      movdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
1457      movdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
1458      movdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
1459      movdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
1460  %endif
1461  
1462      add	rsp, STACK_SIZE
1463  
1464      pop	r15
1465      pop	r14
1466      pop	r13
1467      pop	rbp
1468  %ifndef LINUX
1469      pop	rdi
1470      pop	rsi
1471  %endif
1472      pop	rbx
1473  
1474      ret	
1475      
1476  
1477  section .data
1478  align 64
1479  K256:
1480      dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1481      dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1482      dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1483      dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1484      dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1485      dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1486      dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1487      dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1488      dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1489      dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1490      dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1491      dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1492      dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1493      dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1494      dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1495      dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1496  
1497  PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1498  
1499  ; shuffle xBxA -> 00BA
1500  _SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1501  
1502  ; shuffle xDxC -> DC00
1503  _SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1504  */
1505  
1506  #endif