// Copyright (c) 2018-2022 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
//
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c,
// Written and placed in public domain by Jeffrey Walton.
// Based on code from Intel, and by Sean Gulley for the miTLS project.

// Everything below is compiled only when the build defines ENABLE_X86_SHANI.
#ifdef ENABLE_X86_SHANI

#include <stddef.h> // size_t, used in the Transform() signature
#include <stdint.h>
#include <immintrin.h>

#include <attributes.h> // presumably the source of ALWAYS_INLINE

namespace {

/**
 * Byte-shuffle control that reverses the four bytes inside every 32-bit lane.
 * Used by Load()/Save() to convert between the big-endian words of a SHA-256
 * message/digest and the little-endian lanes of an XMM register.
 */
alignas(__m128i) const uint8_t MASK[16] = {0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04, 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c};

/**
 * SHA-256 initial hash value, already arranged in the register layout that
 * Shuffle() produces: INIT0 holds (low lane to high lane) H5, H4, H1, H0 and
 * INIT1 holds H7, H6, H3, H2. Both are aligned so _mm_load_si128 may be used.
 */
alignas(__m128i) const uint8_t INIT0[16] = {0x8c, 0x68, 0x05, 0x9b, 0x7f, 0x52, 0x0e, 0x51, 0x85, 0xae, 0x67, 0xbb, 0x67, 0xe6, 0x09, 0x6a};
alignas(__m128i) const uint8_t INIT1[16] = {0x19, 0xcd, 0xe0, 0x5b, 0xab, 0xd9, 0x83, 0x1f, 0x3a, 0xf5, 0x4f, 0xa5, 0x72, 0xf3, 0x6e, 0x3c};

/**
 * Run four SHA-256 rounds whose "message word + round constant" sums are
 * fully known at compile time.
 *
 * @param state0,state1  Working state in the layout produced by Shuffle();
 *                       updated in place.
 * @param k1,k0          Four 32-bit addends packed into two 64-bit values.
 *                       k0 supplies the first two rounds, k1 the last two.
 */
void ALWAYS_INLINE QuadRound(__m128i& state0, __m128i& state1, uint64_t k1, uint64_t k0)
{
    const __m128i msg = _mm_set_epi64x(k1, k0);
    /* Each sha256rnds2 performs two rounds using the low 64 bits of its last operand. */
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    /* Shuffle 0x0e moves the upper 64 bits of msg down for the next two rounds. */
    state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(msg, 0x0e));
}

/**
 * Run four SHA-256 rounds on four message-schedule words.
 *
 * @param state0,state1  Working state in the layout produced by Shuffle();
 *                       updated in place.
 * @param m              Four 32-bit message-schedule words.
 * @param k1,k0          The four matching round constants, packed as in the
 *                       overload above; they are added lane-wise to m.
 */
void ALWAYS_INLINE QuadRound(__m128i& state0, __m128i& state1, __m128i m, uint64_t k1, uint64_t k0)
{
    const __m128i msg = _mm_add_epi32(m, _mm_set_epi64x(k1, k0));
    state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
    state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(msg, 0x0e));
}

/** First half of the message-schedule update: m0 = sha256msg1(m0, m1). */
void ALWAYS_INLINE ShiftMessageA(__m128i& m0, __m128i m1)
{
    m0 = _mm_sha256msg1_epu32(m0, m1);
}

/**
 * Second half of the message-schedule update: adds the words that straddle
 * m0/m1 (alignr by 4 bytes) into m2 and completes it with sha256msg2.
 * Only m2 is written; m0 is read but left unchanged.
 */
void ALWAYS_INLINE ShiftMessageC(__m128i& m0, __m128i m1, __m128i& m2)
{
    m2 = _mm_sha256msg2_epu32(_mm_add_epi32(m2, _mm_alignr_epi8(m1, m0, 4)), m1);
}

/**
 * Combined schedule step: finish m2 (ShiftMessageC) and then start the
 * update of m0 (ShiftMessageA). The order matters because ShiftMessageC
 * reads m0 before ShiftMessageA overwrites it.
 */
void ALWAYS_INLINE ShiftMessageB(__m128i& m0, __m128i m1, __m128i& m2)
{
    ShiftMessageC(m0, m1, m2);
    ShiftMessageA(m0, m1);
}

/**
 * Convert the state from linear order (s0 = words 0..3, s1 = words 4..7)
 * into the layout the round helpers operate on: afterwards s0 holds words
 * 5, 4, 1, 0 and s1 holds words 7, 6, 3, 2 (low lane to high lane).
 */
void ALWAYS_INLINE Shuffle(__m128i& s0, __m128i& s1)
{
    const __m128i t1 = _mm_shuffle_epi32(s0, 0xB1);
    const __m128i t2 = _mm_shuffle_epi32(s1, 0x1B);
    s0 = _mm_alignr_epi8(t1, t2, 0x08);
    s1 = _mm_blend_epi16(t2, t1, 0xF0);
}

/** Inverse of Shuffle(): restore the linear word order in s0/s1. */
void ALWAYS_INLINE Unshuffle(__m128i& s0, __m128i& s1)
{
    const __m128i t1 = _mm_shuffle_epi32(s0, 0x1B);
    const __m128i t2 = _mm_shuffle_epi32(s1, 0xB1);
    s0 = _mm_blend_epi16(t1, t2, 0xF0);
    s1 = _mm_alignr_epi8(t2, t1, 0x08);
}

/** Read 16 bytes from `in` (no alignment required) and byte-swap each 32-bit word. */
__m128i ALWAYS_INLINE Load(const unsigned char* in)
{
    return _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)in), _mm_load_si128((const __m128i*)MASK));
}

/** Byte-swap each 32-bit word of `s` and write the 16 bytes to `out` (no alignment required). */
void ALWAYS_INLINE Save(unsigned char* out, __m128i s)
{
    _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(s, _mm_load_si128((const __m128i*)MASK)));
}
}

namespace sha256_x86_shani {
/**
 * Apply the SHA-256 compression function to consecutive 64-byte blocks.
 *
 * @param s       Eight 32-bit state words; read at entry, overwritten with
 *                the updated state on return.
 * @param chunk   Input data; 64 bytes are consumed per block.
 * @param blocks  Number of 64-byte blocks to process. With 0 the state is
 *                written back unchanged.
 */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    __m128i m0, m1, m2, m3, s0, s1, so0, so1;

    /* Load state */
    s0 = _mm_loadu_si128((const __m128i*)s);
    s1 = _mm_loadu_si128((const __m128i*)(s + 4));
    Shuffle(s0, s1);

    while (blocks--) {
        /* Remember old state */
        so0 = s0;
        so1 = s1;

        /* Load data and transform */
        m0 = Load(chunk);
        QuadRound(s0, s1, m0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull);
        m1 = Load(chunk + 16);
        QuadRound(s0, s1, m1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
        ShiftMessageA(m0, m1);
        m2 = Load(chunk + 32);
        QuadRound(s0, s1, m2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
        ShiftMessageA(m1, m2);
        m3 = Load(chunk + 48);
        QuadRound(s0, s1, m3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull);
        ShiftMessageB(m2, m3, m0);
        QuadRound(s0, s1, m0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull);
        ShiftMessageB(m3, m0, m1);
        QuadRound(s0, s1, m1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
        ShiftMessageB(m0, m1, m2);
        QuadRound(s0, s1, m2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
        ShiftMessageB(m1, m2, m3);
        QuadRound(s0, s1, m3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
        ShiftMessageB(m2, m3, m0);
        QuadRound(s0, s1, m0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
        ShiftMessageB(m3, m0, m1);
        QuadRound(s0, s1, m1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
        ShiftMessageB(m0, m1, m2);
        QuadRound(s0, s1, m2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull);
        ShiftMessageB(m1, m2, m3);
        QuadRound(s0, s1, m3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
        ShiftMessageB(m2, m3, m0);
        QuadRound(s0, s1, m0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
        ShiftMessageB(m3, m0, m1);
        QuadRound(s0, s1, m1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
        /* The last two schedule steps only need to finish words; nothing further is started. */
        ShiftMessageC(m0, m1, m2);
        QuadRound(s0, s1, m2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
        ShiftMessageC(m1, m2, m3);
        QuadRound(s0, s1, m3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull);

        /* Combine with old state */
        s0 = _mm_add_epi32(s0, so0);
        s1 = _mm_add_epi32(s1, so1);

        /* Advance */
        chunk += 64;
    }

    Unshuffle(s0, s1);
    _mm_storeu_si128((__m128i*)s, s0);
    _mm_storeu_si128((__m128i*)(s + 4), s1);
}
}

namespace sha256d64_x86_shani {

/**
 * Compute double-SHA256 of two independent 64-byte inputs, interleaving the
 * two computations ("a" and "b" lanes) statement by statement.
 *
 * @param out  Receives 64 bytes: the 32-byte result for the first input
 *             followed by the 32-byte result for the second.
 * @param in   128 bytes: the first 64-byte input followed by the second.
 *
 * Three compressions are run per input:
 *   1. the 64 data bytes, starting from the SHA-256 initial state;
 *   2. the fixed padding block of a 64-byte message, whose schedule is
 *      constant and therefore folded into the round addends;
 *   3. the 32-byte digest from steps 1-2 plus its padding, again starting
 *      from the initial state.
 */
void Transform_2way(unsigned char* out, const unsigned char* in)
{
    __m128i am0, am1, am2, am3, as0, as1, aso0, aso1;
    __m128i bm0, bm1, bm2, bm3, bs0, bs1, bso0, bso1;

    /* Transform 1 */
    bs0 = as0 = _mm_load_si128((const __m128i*)INIT0);
    bs1 = as1 = _mm_load_si128((const __m128i*)INIT1);
    am0 = Load(in);
    bm0 = Load(in + 64);
    QuadRound(as0, as1, am0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull);
    QuadRound(bs0, bs1, bm0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull);
    am1 = Load(in + 16);
    bm1 = Load(in + 80);
    QuadRound(as0, as1, am1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
    QuadRound(bs0, bs1, bm1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
    ShiftMessageA(am0, am1);
    ShiftMessageA(bm0, bm1);
    am2 = Load(in + 32);
    bm2 = Load(in + 96);
    QuadRound(as0, as1, am2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
    QuadRound(bs0, bs1, bm2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
    ShiftMessageA(am1, am2);
    ShiftMessageA(bm1, bm2);
    am3 = Load(in + 48);
    bm3 = Load(in + 112);
    QuadRound(as0, as1, am3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull);
    QuadRound(bs0, bs1, bm3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull);
    ShiftMessageB(am2, am3, am0);
    ShiftMessageB(bm2, bm3, bm0);
    QuadRound(as0, as1, am0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull);
    QuadRound(bs0, bs1, bm0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull);
    ShiftMessageB(am3, am0, am1);
    ShiftMessageB(bm3, bm0, bm1);
    QuadRound(as0, as1, am1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
    QuadRound(bs0, bs1, bm1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
    ShiftMessageB(am0, am1, am2);
    ShiftMessageB(bm0, bm1, bm2);
    QuadRound(as0, as1, am2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
    QuadRound(bs0, bs1, bm2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
    ShiftMessageB(am1, am2, am3);
    ShiftMessageB(bm1, bm2, bm3);
    QuadRound(as0, as1, am3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
    QuadRound(bs0, bs1, bm3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
    ShiftMessageB(am2, am3, am0);
    ShiftMessageB(bm2, bm3, bm0);
    QuadRound(as0, as1, am0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
    QuadRound(bs0, bs1, bm0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
    ShiftMessageB(am3, am0, am1);
    ShiftMessageB(bm3, bm0, bm1);
    QuadRound(as0, as1, am1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
    QuadRound(bs0, bs1, bm1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
    ShiftMessageB(am0, am1, am2);
    ShiftMessageB(bm0, bm1, bm2);
    QuadRound(as0, as1, am2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull);
    QuadRound(bs0, bs1, bm2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull);
    ShiftMessageB(am1, am2, am3);
    ShiftMessageB(bm1, bm2, bm3);
    QuadRound(as0, as1, am3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
    QuadRound(bs0, bs1, bm3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
    ShiftMessageB(am2, am3, am0);
    ShiftMessageB(bm2, bm3, bm0);
    QuadRound(as0, as1, am0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
    QuadRound(bs0, bs1, bm0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
    ShiftMessageB(am3, am0, am1);
    ShiftMessageB(bm3, bm0, bm1);
    QuadRound(as0, as1, am1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
    QuadRound(bs0, bs1, bm1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
    ShiftMessageC(am0, am1, am2);
    ShiftMessageC(bm0, bm1, bm2);
    QuadRound(as0, as1, am2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
    QuadRound(bs0, bs1, bm2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
    ShiftMessageC(am1, am2, am3);
    ShiftMessageC(bm1, bm2, bm3);
    QuadRound(as0, as1, am3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull);
    QuadRound(bs0, bs1, bm3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull);
    /* Feed-forward: add the initial state that this compression started from. */
    as0 = _mm_add_epi32(as0, _mm_load_si128((const __m128i*)INIT0));
    bs0 = _mm_add_epi32(bs0, _mm_load_si128((const __m128i*)INIT0));
    as1 = _mm_add_epi32(as1, _mm_load_si128((const __m128i*)INIT1));
    bs1 = _mm_add_epi32(bs1, _mm_load_si128((const __m128i*)INIT1));

    /* Transform 2 */
    /* The block is constant padding, so every addend below is a round
     * constant with the corresponding schedule word already added in
     * (e.g. the first is K[0] + 0x80000000); no message registers are used. */
    aso0 = as0;
    bso0 = bs0;
    aso1 = as1;
    bso1 = bs1;
    QuadRound(as0, as1, 0xe9b5dba5b5c0fbcfull, 0x71374491c28a2f98ull);
    QuadRound(bs0, bs1, 0xe9b5dba5b5c0fbcfull, 0x71374491c28a2f98ull);
    QuadRound(as0, as1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
    QuadRound(bs0, bs1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
    QuadRound(as0, as1, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
    QuadRound(bs0, bs1, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
    QuadRound(as0, as1, 0xc19bf3749bdc06a7ull, 0x80deb1fe72be5d74ull);
    QuadRound(bs0, bs1, 0xc19bf3749bdc06a7ull, 0x80deb1fe72be5d74ull);
    QuadRound(as0, as1, 0x240cf2540fe1edc6ull, 0xf0fe4786649b69c1ull);
    QuadRound(bs0, bs1, 0x240cf2540fe1edc6ull, 0xf0fe4786649b69c1ull);
    QuadRound(as0, as1, 0x16f988fa61b9411eull, 0x6cc984be4fe9346full);
    QuadRound(bs0, bs1, 0x16f988fa61b9411eull, 0x6cc984be4fe9346full);
    QuadRound(as0, as1, 0xb9d99ec7b019fc65ull, 0xa88e5a6df2c65152ull);
    QuadRound(bs0, bs1, 0xb9d99ec7b019fc65ull, 0xa88e5a6df2c65152ull);
    QuadRound(as0, as1, 0xc7353eb0fdb1232bull, 0xe70eeaa09a1231c3ull);
    QuadRound(bs0, bs1, 0xc7353eb0fdb1232bull, 0xe70eeaa09a1231c3ull);
    QuadRound(as0, as1, 0xdc1eeefd5a0f118full, 0xcb976d5f3069bad5ull);
    QuadRound(bs0, bs1, 0xdc1eeefd5a0f118full, 0xcb976d5f3069bad5ull);
    QuadRound(as0, as1, 0xe15d5b1658f4ca9dull, 0xde0b7a040a35b689ull);
    QuadRound(bs0, bs1, 0xe15d5b1658f4ca9dull, 0xde0b7a040a35b689ull);
    QuadRound(as0, as1, 0x6fab9537a507ea32ull, 0x37088980007f3e86ull);
    QuadRound(bs0, bs1, 0x6fab9537a507ea32ull, 0x37088980007f3e86ull);
    QuadRound(as0, as1, 0xc0bbbe37cdaa3b6dull, 0x0d8cd6f117406110ull);
    QuadRound(bs0, bs1, 0xc0bbbe37cdaa3b6dull, 0x0d8cd6f117406110ull);
    QuadRound(as0, as1, 0x6fd15ca70b02e931ull, 0xdb48a36383613bdaull);
    QuadRound(bs0, bs1, 0x6fd15ca70b02e931ull, 0xdb48a36383613bdaull);
    QuadRound(as0, as1, 0x6d4378906ed41a95ull, 0x31338431521afacaull);
    QuadRound(bs0, bs1, 0x6d4378906ed41a95ull, 0x31338431521afacaull);
    QuadRound(as0, as1, 0x532fb63cb5c9a0e6ull, 0x9eccabbdc39c91f2ull);
    QuadRound(bs0, bs1, 0x532fb63cb5c9a0e6ull, 0x9eccabbdc39c91f2ull);
    QuadRound(as0, as1, 0x4c191d76a4954b68ull, 0x07237ea3d2c741c6ull);
    QuadRound(bs0, bs1, 0x4c191d76a4954b68ull, 0x07237ea3d2c741c6ull);
    as0 = _mm_add_epi32(as0, aso0);
    bs0 = _mm_add_epi32(bs0, bso0);
    as1 = _mm_add_epi32(as1, aso1);
    bs1 = _mm_add_epi32(bs1, bso1);

    /* Extract hash */
    /* After Unshuffle the state is in linear word order, which is exactly
     * the first eight message words of the third compression. */
    Unshuffle(as0, as1);
    Unshuffle(bs0, bs1);
    am0 = as0;
    bm0 = bs0;
    am1 = as1;
    bm1 = bs1;

    /* Transform 3 */
    bs0 = as0 = _mm_load_si128((const __m128i*)INIT0);
    bs1 = as1 = _mm_load_si128((const __m128i*)INIT1);
    QuadRound(as0, as1, am0, 0xe9b5dba5B5c0fbcfull, 0x71374491428a2f98ull);
    QuadRound(bs0, bs1, bm0, 0xe9b5dba5B5c0fbcfull, 0x71374491428a2f98ull);
    QuadRound(as0, as1, am1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
    QuadRound(bs0, bs1, bm1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
    ShiftMessageA(am0, am1);
    ShiftMessageA(bm0, bm1);
    /* Words 8..11 are padding: 0x80000000 followed by zeros. The rounds use
     * addends with that word already folded in (K[8] + 0x80000000); the
     * register is still needed as input to the schedule steps below. */
    bm2 = am2 = _mm_set_epi64x(0x0ull, 0x80000000ull);
    QuadRound(as0, as1, 0x550c7dc3243185beull, 0x12835b015807aa98ull);
    QuadRound(bs0, bs1, 0x550c7dc3243185beull, 0x12835b015807aa98ull);
    ShiftMessageA(am1, am2);
    ShiftMessageA(bm1, bm2);
    /* Words 12..15: zeros, then the message length in bits (0x100 = 256)
     * in the last word; again folded into the addend (K[15] + 0x100). */
    bm3 = am3 = _mm_set_epi64x(0x10000000000ull, 0x0ull);
    QuadRound(as0, as1, 0xc19bf2749bdc06a7ull, 0x80deb1fe72be5d74ull);
    QuadRound(bs0, bs1, 0xc19bf2749bdc06a7ull, 0x80deb1fe72be5d74ull);
    ShiftMessageB(am2, am3, am0);
    ShiftMessageB(bm2, bm3, bm0);
    QuadRound(as0, as1, am0, 0x240ca1cc0fc19dc6ull, 0xefbe4786e49b69c1ull);
    QuadRound(bs0, bs1, bm0, 0x240ca1cc0fc19dc6ull, 0xefbe4786e49b69c1ull);
    ShiftMessageB(am3, am0, am1);
    ShiftMessageB(bm3, bm0, bm1);
    QuadRound(as0, as1, am1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
    QuadRound(bs0, bs1, bm1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
    ShiftMessageB(am0, am1, am2);
    ShiftMessageB(bm0, bm1, bm2);
    QuadRound(as0, as1, am2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
    QuadRound(bs0, bs1, bm2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
    ShiftMessageB(am1, am2, am3);
    ShiftMessageB(bm1, bm2, bm3);
    QuadRound(as0, as1, am3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
    QuadRound(bs0, bs1, bm3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
    ShiftMessageB(am2, am3, am0);
    ShiftMessageB(bm2, bm3, bm0);
    QuadRound(as0, as1, am0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
    QuadRound(bs0, bs1, bm0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
    ShiftMessageB(am3, am0, am1);
    ShiftMessageB(bm3, bm0, bm1);
    QuadRound(as0, as1, am1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
    QuadRound(bs0, bs1, bm1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
    ShiftMessageB(am0, am1, am2);
    ShiftMessageB(bm0, bm1, bm2);
    QuadRound(as0, as1, am2, 0xc76c51a3c24b8b70ull, 0xa81a664ba2bfe8A1ull);
    QuadRound(bs0, bs1, bm2, 0xc76c51a3c24b8b70ull, 0xa81a664ba2bfe8A1ull);
    ShiftMessageB(am1, am2, am3);
    ShiftMessageB(bm1, bm2, bm3);
    QuadRound(as0, as1, am3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
    QuadRound(bs0, bs1, bm3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
    ShiftMessageB(am2, am3, am0);
    ShiftMessageB(bm2, bm3, bm0);
    QuadRound(as0, as1, am0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
    QuadRound(bs0, bs1, bm0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
    ShiftMessageB(am3, am0, am1);
    ShiftMessageB(bm3, bm0, bm1);
    QuadRound(as0, as1, am1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
    QuadRound(bs0, bs1, bm1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
    ShiftMessageC(am0, am1, am2);
    ShiftMessageC(bm0, bm1, bm2);
    QuadRound(as0, as1, am2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
    QuadRound(bs0, bs1, bm2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
    ShiftMessageC(am1, am2, am3);
    ShiftMessageC(bm1, bm2, bm3);
    QuadRound(as0, as1, am3, 0xc67178f2bef9a3f7ull, 0xa4506ceb90befffaull);
    QuadRound(bs0, bs1, bm3, 0xc67178f2bef9a3f7ull, 0xa4506ceb90befffaull);
    as0 = _mm_add_epi32(as0, _mm_load_si128((const __m128i*)INIT0));
    bs0 = _mm_add_epi32(bs0, _mm_load_si128((const __m128i*)INIT0));
    as1 = _mm_add_epi32(as1, _mm_load_si128((const __m128i*)INIT1));
    bs1 = _mm_add_epi32(bs1, _mm_load_si128((const __m128i*)INIT1));

    /* Extract hash into out */
    Unshuffle(as0, as1);
    Unshuffle(bs0, bs1);
    Save(out, as0);
    Save(out + 16, as1);
    Save(out + 32, bs0);
    Save(out + 48, bs1);
}

}

#endif