// src/crypto/sha256_arm_shani.cpp
  1  // Copyright (c) 2022-present The Bitcoin Core developers
  2  // Distributed under the MIT software license, see the accompanying
  3  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  4  //
  5  // Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
  6  // Written and placed in public domain by Jeffrey Walton.
  7  // Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
  8  // Barry O'Rourke for the mbedTLS project.
  9  // Variant specialized for 64-byte inputs added by Pieter Wuille.
 10  
 11  #ifdef ENABLE_ARM_SHANI
 12  
 13  #include <array>
 14  #include <cstdint>
 15  #include <cstddef>
 16  #include <arm_neon.h>
 17  
 18  namespace {
 19  alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
 20  {
 21      0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
 22      0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
 23      0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
 24      0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
 25      0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
 26      0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
 27      0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
 28      0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
 29      0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
 30      0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
 31      0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
 32      0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
 33      0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
 34      0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
 35      0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
 36      0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
 37  };
 38  }
 39  
 40  namespace sha256_arm_shani {
 41  void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
 42  {
 43      uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
 44      uint32x4_t MSG0, MSG1, MSG2, MSG3;
 45      uint32x4_t TMP0, TMP2;
 46  
 47      // Load state
 48      STATE0 = vld1q_u32(&s[0]);
 49      STATE1 = vld1q_u32(&s[4]);
 50  
 51      while (blocks--)
 52      {
 53          // Save state
 54          ABEF_SAVE = STATE0;
 55          CDGH_SAVE = STATE1;
 56  
 57          // Load and convert input chunk to Big Endian
 58          MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
 59          MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
 60          MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
 61          MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
 62          chunk += 64;
 63  
 64          // Original implementation preloaded message and constant addition which was 1-3% slower.
 65          // Now included as first step in quad round code saving one Q Neon register
 66          // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
 67  
 68          // Rounds 1-4
 69          TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
 70          TMP2 = STATE0;
 71          MSG0 = vsha256su0q_u32(MSG0, MSG1);
 72          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
 73          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
 74          MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
 75  
 76          // Rounds 5-8
 77          TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
 78          TMP2 = STATE0;
 79          MSG1 = vsha256su0q_u32(MSG1, MSG2);
 80          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
 81          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
 82          MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
 83  
 84          // Rounds 9-12
 85          TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
 86          TMP2 = STATE0;
 87          MSG2 = vsha256su0q_u32(MSG2, MSG3);
 88          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
 89          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
 90          MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
 91  
 92          // Rounds 13-16
 93          TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
 94          TMP2 = STATE0;
 95          MSG3 = vsha256su0q_u32(MSG3, MSG0);
 96          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
 97          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
 98          MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
 99  
100          // Rounds 17-20
101          TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102          TMP2 = STATE0;
103          MSG0 = vsha256su0q_u32(MSG0, MSG1);
104          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106          MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107  
108          // Rounds 21-24
109          TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110          TMP2 = STATE0;
111          MSG1 = vsha256su0q_u32(MSG1, MSG2);
112          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114          MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115  
116          // Rounds 25-28
117          TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118          TMP2 = STATE0;
119          MSG2 = vsha256su0q_u32(MSG2, MSG3);
120          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122          MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123  
124          // Rounds 29-32
125          TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126          TMP2 = STATE0;
127          MSG3 = vsha256su0q_u32(MSG3, MSG0);
128          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130          MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131  
132          // Rounds 33-36
133          TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134          TMP2 = STATE0;
135          MSG0 = vsha256su0q_u32(MSG0, MSG1);
136          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138          MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139  
140          // Rounds 37-40
141          TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142          TMP2 = STATE0;
143          MSG1 = vsha256su0q_u32(MSG1, MSG2);
144          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146          MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147  
148          // Rounds 41-44
149          TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150          TMP2 = STATE0;
151          MSG2 = vsha256su0q_u32(MSG2, MSG3);
152          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154          MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155  
156          // Rounds 45-48
157          TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158          TMP2 = STATE0;
159          MSG3 = vsha256su0q_u32(MSG3, MSG0);
160          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162          MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163  
164          // Rounds 49-52
165          TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166          TMP2 = STATE0;
167          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169  
170          // Rounds 53-56
171          TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172          TMP2 = STATE0;
173          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175  
176          // Rounds 57-60
177          TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178          TMP2 = STATE0;
179          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181  
182          // Rounds 61-64
183          TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184          TMP2 = STATE0;
185          STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186          STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187  
188          // Update state
189          STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
190          STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
191      }
192  
193      // Save final state
194      vst1q_u32(&s[0], STATE0);
195      vst1q_u32(&s[4], STATE1);
196  }
197  }
198  
199  namespace sha256d64_arm_shani {
200  void Transform_2way(unsigned char* output, const unsigned char* input)
201  {
202      /* Initial state. */
203      alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204          0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206      };
207  
208      /* Precomputed message schedule for the 2nd transform. */
209      alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210          0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211          0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212          0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213          0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214          0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215          0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216          0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217          0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218          0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219          0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220          0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221          0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222          0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223          0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224          0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225          0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226      };
227  
228      /* A few precomputed message schedule values for the 3rd transform. */
229      alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230          0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231          0x80000000, 0x00000000, 0x00000000, 0x00000000,
232          0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233      };
234  
235      /* Padding processed in the 3rd transform (byteswapped). */
236      alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237  
238      uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
239      uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240      uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241  
242      // Transform 1: Load state
243      STATE0A = vld1q_u32(&INIT[0]);
244      STATE0B = STATE0A;
245      STATE1A = vld1q_u32(&INIT[4]);
246      STATE1B = STATE1A;
247  
248      // Transform 1: Load and convert input chunk to Big Endian
249      MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250      MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251      MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252      MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253      MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254      MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255      MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256      MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257  
258      // Transform 1: Rounds 1-4
259      TMP = vld1q_u32(&K[0]);
260      TMP0A = vaddq_u32(MSG0A, TMP);
261      TMP0B = vaddq_u32(MSG0B, TMP);
262      TMP2A = STATE0A;
263      TMP2B = STATE0B;
264      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272  
273      // Transform 1: Rounds 5-8
274      TMP = vld1q_u32(&K[4]);
275      TMP0A = vaddq_u32(MSG1A, TMP);
276      TMP0B = vaddq_u32(MSG1B, TMP);
277      TMP2A = STATE0A;
278      TMP2B = STATE0B;
279      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287  
288      // Transform 1: Rounds 9-12
289      TMP = vld1q_u32(&K[8]);
290      TMP0A = vaddq_u32(MSG2A, TMP);
291      TMP0B = vaddq_u32(MSG2B, TMP);
292      TMP2A = STATE0A;
293      TMP2B = STATE0B;
294      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302  
303      // Transform 1: Rounds 13-16
304      TMP = vld1q_u32(&K[12]);
305      TMP0A = vaddq_u32(MSG3A, TMP);
306      TMP0B = vaddq_u32(MSG3B, TMP);
307      TMP2A = STATE0A;
308      TMP2B = STATE0B;
309      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317  
318      // Transform 1: Rounds 17-20
319      TMP = vld1q_u32(&K[16]);
320      TMP0A = vaddq_u32(MSG0A, TMP);
321      TMP0B = vaddq_u32(MSG0B, TMP);
322      TMP2A = STATE0A;
323      TMP2B = STATE0B;
324      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332  
333      // Transform 1: Rounds 21-24
334      TMP = vld1q_u32(&K[20]);
335      TMP0A = vaddq_u32(MSG1A, TMP);
336      TMP0B = vaddq_u32(MSG1B, TMP);
337      TMP2A = STATE0A;
338      TMP2B = STATE0B;
339      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347  
348      // Transform 1: Rounds 25-28
349      TMP = vld1q_u32(&K[24]);
350      TMP0A = vaddq_u32(MSG2A, TMP);
351      TMP0B = vaddq_u32(MSG2B, TMP);
352      TMP2A = STATE0A;
353      TMP2B = STATE0B;
354      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362  
363      // Transform 1: Rounds 29-32
364      TMP = vld1q_u32(&K[28]);
365      TMP0A = vaddq_u32(MSG3A, TMP);
366      TMP0B = vaddq_u32(MSG3B, TMP);
367      TMP2A = STATE0A;
368      TMP2B = STATE0B;
369      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377  
378      // Transform 1: Rounds 33-36
379      TMP = vld1q_u32(&K[32]);
380      TMP0A = vaddq_u32(MSG0A, TMP);
381      TMP0B = vaddq_u32(MSG0B, TMP);
382      TMP2A = STATE0A;
383      TMP2B = STATE0B;
384      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392  
393      // Transform 1: Rounds 37-40
394      TMP = vld1q_u32(&K[36]);
395      TMP0A = vaddq_u32(MSG1A, TMP);
396      TMP0B = vaddq_u32(MSG1B, TMP);
397      TMP2A = STATE0A;
398      TMP2B = STATE0B;
399      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407  
408      // Transform 1: Rounds 41-44
409      TMP = vld1q_u32(&K[40]);
410      TMP0A = vaddq_u32(MSG2A, TMP);
411      TMP0B = vaddq_u32(MSG2B, TMP);
412      TMP2A = STATE0A;
413      TMP2B = STATE0B;
414      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422  
423      // Transform 1: Rounds 45-48
424      TMP = vld1q_u32(&K[44]);
425      TMP0A = vaddq_u32(MSG3A, TMP);
426      TMP0B = vaddq_u32(MSG3B, TMP);
427      TMP2A = STATE0A;
428      TMP2B = STATE0B;
429      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437  
438      // Transform 1: Rounds 49-52
439      TMP = vld1q_u32(&K[48]);
440      TMP0A = vaddq_u32(MSG0A, TMP);
441      TMP0B = vaddq_u32(MSG0B, TMP);
442      TMP2A = STATE0A;
443      TMP2B = STATE0B;
444      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448  
449      // Transform 1: Rounds 53-56
450      TMP = vld1q_u32(&K[52]);
451      TMP0A = vaddq_u32(MSG1A, TMP);
452      TMP0B = vaddq_u32(MSG1B, TMP);
453      TMP2A = STATE0A;
454      TMP2B = STATE0B;
455      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459  
460      // Transform 1: Rounds 57-60
461      TMP = vld1q_u32(&K[56]);
462      TMP0A = vaddq_u32(MSG2A, TMP);
463      TMP0B = vaddq_u32(MSG2B, TMP);
464      TMP2A = STATE0A;
465      TMP2B = STATE0B;
466      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470  
471      // Transform 1: Rounds 61-64
472      TMP = vld1q_u32(&K[60]);
473      TMP0A = vaddq_u32(MSG3A, TMP);
474      TMP0B = vaddq_u32(MSG3B, TMP);
475      TMP2A = STATE0A;
476      TMP2B = STATE0B;
477      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481  
482      // Transform 1: Update state
483      TMP = vld1q_u32(&INIT[0]);
484      STATE0A = vaddq_u32(STATE0A, TMP);
485      STATE0B = vaddq_u32(STATE0B, TMP);
486      TMP = vld1q_u32(&INIT[4]);
487      STATE1A = vaddq_u32(STATE1A, TMP);
488      STATE1B = vaddq_u32(STATE1B, TMP);
489  
490      // Transform 2: Save state
491      ABEF_SAVEA = STATE0A;
492      ABEF_SAVEB = STATE0B;
493      CDGH_SAVEA = STATE1A;
494      CDGH_SAVEB = STATE1B;
495  
496      // Transform 2: Rounds 1-4
497      TMP = vld1q_u32(&MIDS[0]);
498      TMP2A = STATE0A;
499      TMP2B = STATE0B;
500      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504  
505      // Transform 2: Rounds 5-8
506      TMP = vld1q_u32(&MIDS[4]);
507      TMP2A = STATE0A;
508      TMP2B = STATE0B;
509      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513  
514      // Transform 2: Rounds 9-12
515      TMP = vld1q_u32(&MIDS[8]);
516      TMP2A = STATE0A;
517      TMP2B = STATE0B;
518      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522  
523      // Transform 2: Rounds 13-16
524      TMP = vld1q_u32(&MIDS[12]);
525      TMP2A = STATE0A;
526      TMP2B = STATE0B;
527      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531  
532      // Transform 2: Rounds 17-20
533      TMP = vld1q_u32(&MIDS[16]);
534      TMP2A = STATE0A;
535      TMP2B = STATE0B;
536      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540  
541      // Transform 2: Rounds 21-24
542      TMP = vld1q_u32(&MIDS[20]);
543      TMP2A = STATE0A;
544      TMP2B = STATE0B;
545      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549  
550      // Transform 2: Rounds 25-28
551      TMP = vld1q_u32(&MIDS[24]);
552      TMP2A = STATE0A;
553      TMP2B = STATE0B;
554      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558  
559      // Transform 2: Rounds 29-32
560      TMP = vld1q_u32(&MIDS[28]);
561      TMP2A = STATE0A;
562      TMP2B = STATE0B;
563      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567  
568      // Transform 2: Rounds 33-36
569      TMP = vld1q_u32(&MIDS[32]);
570      TMP2A = STATE0A;
571      TMP2B = STATE0B;
572      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576  
577      // Transform 2: Rounds 37-40
578      TMP = vld1q_u32(&MIDS[36]);
579      TMP2A = STATE0A;
580      TMP2B = STATE0B;
581      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585  
586      // Transform 2: Rounds 41-44
587      TMP = vld1q_u32(&MIDS[40]);
588      TMP2A = STATE0A;
589      TMP2B = STATE0B;
590      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594  
595      // Transform 2: Rounds 45-48
596      TMP = vld1q_u32(&MIDS[44]);
597      TMP2A = STATE0A;
598      TMP2B = STATE0B;
599      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603  
604      // Transform 2: Rounds 49-52
605      TMP = vld1q_u32(&MIDS[48]);
606      TMP2A = STATE0A;
607      TMP2B = STATE0B;
608      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612  
613      // Transform 2: Rounds 53-56
614      TMP = vld1q_u32(&MIDS[52]);
615      TMP2A = STATE0A;
616      TMP2B = STATE0B;
617      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621  
622      // Transform 2: Rounds 57-60
623      TMP = vld1q_u32(&MIDS[56]);
624      TMP2A = STATE0A;
625      TMP2B = STATE0B;
626      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630  
631      // Transform 2: Rounds 61-64
632      TMP = vld1q_u32(&MIDS[60]);
633      TMP2A = STATE0A;
634      TMP2B = STATE0B;
635      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639  
640      // Transform 2: Update state
641      STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
642      STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
643      STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
644      STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
645  
646      // Transform 3: Pad previous output
647      MSG0A = STATE0A;
648      MSG0B = STATE0B;
649      MSG1A = STATE1A;
650      MSG1B = STATE1B;
651      MSG2A = vld1q_u32(&FINAL[0]);
652      MSG2B = MSG2A;
653      MSG3A = vld1q_u32(&FINAL[4]);
654      MSG3B = MSG3A;
655  
656      // Transform 3: Load state
657      STATE0A = vld1q_u32(&INIT[0]);
658      STATE0B = STATE0A;
659      STATE1A = vld1q_u32(&INIT[4]);
660      STATE1B = STATE1A;
661  
662      // Transform 3: Rounds 1-4
663      TMP = vld1q_u32(&K[0]);
664      TMP0A = vaddq_u32(MSG0A, TMP);
665      TMP0B = vaddq_u32(MSG0B, TMP);
666      TMP2A = STATE0A;
667      TMP2B = STATE0B;
668      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676  
677      // Transform 3: Rounds 5-8
678      TMP = vld1q_u32(&K[4]);
679      TMP0A = vaddq_u32(MSG1A, TMP);
680      TMP0B = vaddq_u32(MSG1B, TMP);
681      TMP2A = STATE0A;
682      TMP2B = STATE0B;
683      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691  
692      // Transform 3: Rounds 9-12
693      TMP = vld1q_u32(&FINS[0]);
694      TMP2A = STATE0A;
695      TMP2B = STATE0B;
696      MSG2A = vld1q_u32(&FINS[4]);
697      MSG2B = MSG2A;
698      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704  
705      // Transform 3: Rounds 13-16
706      TMP = vld1q_u32(&FINS[8]);
707      TMP2A = STATE0A;
708      TMP2B = STATE0B;
709      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717  
718      // Transform 3: Rounds 17-20
719      TMP = vld1q_u32(&K[16]);
720      TMP0A = vaddq_u32(MSG0A, TMP);
721      TMP0B = vaddq_u32(MSG0B, TMP);
722      TMP2A = STATE0A;
723      TMP2B = STATE0B;
724      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732  
733      // Transform 3: Rounds 21-24
734      TMP = vld1q_u32(&K[20]);
735      TMP0A = vaddq_u32(MSG1A, TMP);
736      TMP0B = vaddq_u32(MSG1B, TMP);
737      TMP2A = STATE0A;
738      TMP2B = STATE0B;
739      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747  
748      // Transform 3: Rounds 25-28
749      TMP = vld1q_u32(&K[24]);
750      TMP0A = vaddq_u32(MSG2A, TMP);
751      TMP0B = vaddq_u32(MSG2B, TMP);
752      TMP2A = STATE0A;
753      TMP2B = STATE0B;
754      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762  
763      // Transform 3: Rounds 29-32
764      TMP = vld1q_u32(&K[28]);
765      TMP0A = vaddq_u32(MSG3A, TMP);
766      TMP0B = vaddq_u32(MSG3B, TMP);
767      TMP2A = STATE0A;
768      TMP2B = STATE0B;
769      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777  
778      // Transform 3: Rounds 33-36
779      TMP = vld1q_u32(&K[32]);
780      TMP0A = vaddq_u32(MSG0A, TMP);
781      TMP0B = vaddq_u32(MSG0B, TMP);
782      TMP2A = STATE0A;
783      TMP2B = STATE0B;
784      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792  
793      // Transform 3: Rounds 37-40
794      TMP = vld1q_u32(&K[36]);
795      TMP0A = vaddq_u32(MSG1A, TMP);
796      TMP0B = vaddq_u32(MSG1B, TMP);
797      TMP2A = STATE0A;
798      TMP2B = STATE0B;
799      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807  
808      // Transform 3: Rounds 41-44
809      TMP = vld1q_u32(&K[40]);
810      TMP0A = vaddq_u32(MSG2A, TMP);
811      TMP0B = vaddq_u32(MSG2B, TMP);
812      TMP2A = STATE0A;
813      TMP2B = STATE0B;
814      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822  
823      // Transform 3: Rounds 45-48
824      TMP = vld1q_u32(&K[44]);
825      TMP0A = vaddq_u32(MSG3A, TMP);
826      TMP0B = vaddq_u32(MSG3B, TMP);
827      TMP2A = STATE0A;
828      TMP2B = STATE0B;
829      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837  
838      // Transform 3: Rounds 49-52
839      TMP = vld1q_u32(&K[48]);
840      TMP0A = vaddq_u32(MSG0A, TMP);
841      TMP0B = vaddq_u32(MSG0B, TMP);
842      TMP2A = STATE0A;
843      TMP2B = STATE0B;
844      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848  
849      // Transform 3: Rounds 53-56
850      TMP = vld1q_u32(&K[52]);
851      TMP0A = vaddq_u32(MSG1A, TMP);
852      TMP0B = vaddq_u32(MSG1B, TMP);
853      TMP2A = STATE0A;
854      TMP2B = STATE0B;
855      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859  
860      // Transform 3: Rounds 57-60
861      TMP = vld1q_u32(&K[56]);
862      TMP0A = vaddq_u32(MSG2A, TMP);
863      TMP0B = vaddq_u32(MSG2B, TMP);
864      TMP2A = STATE0A;
865      TMP2B = STATE0B;
866      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870  
871      // Transform 3: Rounds 61-64
872      TMP = vld1q_u32(&K[60]);
873      TMP0A = vaddq_u32(MSG3A, TMP);
874      TMP0B = vaddq_u32(MSG3B, TMP);
875      TMP2A = STATE0A;
876      TMP2B = STATE0B;
877      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881  
882      // Transform 3: Update state
883      TMP = vld1q_u32(&INIT[0]);
884      STATE0A = vaddq_u32(STATE0A, TMP);
885      STATE0B = vaddq_u32(STATE0B, TMP);
886      TMP = vld1q_u32(&INIT[4]);
887      STATE1A = vaddq_u32(STATE1A, TMP);
888      STATE1B = vaddq_u32(STATE1B, TMP);
889  
890      // Store result
891      vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892      vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893      vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894      vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895  }
896  }
897  
898  #endif