// src/crypto/sha256_arm_shani.cpp
  1  // Copyright (c) 2022 The Bitcoin Core developers
  2  // Distributed under the MIT software license, see the accompanying
  3  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  4  //
  5  // Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
  6  // Written and placed in public domain by Jeffrey Walton.
  7  // Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
  8  // Barry O'Rourke for the mbedTLS project.
  9  // Variant specialized for 64-byte inputs added by Pieter Wuille.
 10  
 11  #ifdef ENABLE_ARM_SHANI
 12  
 13  #include <array>
 14  #include <cstdint>
 15  #include <cstddef>
 16  #include <arm_acle.h>
 17  #include <arm_neon.h>
 18  
namespace {
// SHA-256 round constants K[0..63] (one 32-bit word per round).
// Aligned to uint32x4_t because the round loops below load them four at a
// time with vld1q_u32 (&K[4*i]) to feed the ARMv8 SHA-2 intrinsics.
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
{
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
}
 40  
namespace sha256_arm_shani {
/** Perform `blocks` SHA-256 compression-function iterations using the ARMv8
 *  SHA-2 crypto extensions.
 *
 *  @param s      Pointer to the 8-word (32 bytes) hash state; read on entry,
 *                overwritten with the updated state on return.
 *  @param chunk  Pointer to `blocks` consecutive 64-byte message blocks.
 *  @param blocks Number of 64-byte blocks to process.
 *
 *  The state is kept in two 128-bit vectors: STATE0 holds words s[0..3] and
 *  STATE1 holds s[4..7], the split (saved as ABEF/CDGH below) expected by the
 *  vsha256h/vsha256h2 instructions. Each "Rounds N-N+3" section performs four
 *  SHA-256 rounds: it adds four round constants to a message-schedule vector,
 *  runs the two halves of the round function (vsha256hq/vsha256h2q), and — for
 *  rounds 1-48 — extends the message schedule (vsha256su0q/vsha256su1q).
 */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP2;

    // Load state
    STATE0 = vld1q_u32(&s[0]);
    STATE1 = vld1q_u32(&s[4]);

    while (blocks--)
    {
        // Save state so the feed-forward addition can be applied after the
        // 64 rounds, per the SHA-256 specification.
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Load and convert input chunk to Big Endian
        // (vrev32q_u8 byte-swaps each 32-bit lane; SHA-256 words are big endian).
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        // Original implementation preloaded message and constant addition which was 1-3% slower.
        // Now included as first step in quad round code saving one Q Neon register
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"

        // Rounds 1-4
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
        TMP2 = STATE0; // vsha256h clobbers STATE0, but vsha256h2 needs the old value
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 5-8
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 9-12
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 13-16
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 17-20
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 21-24
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 25-28
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 29-32
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 33-36
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 37-40
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 41-44
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 45-48
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 49-64 need no further message-schedule expansion, so the
        // vsha256su0q/vsha256su1q steps are omitted from here on.

        // Rounds 49-52
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 53-56
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 57-60
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 61-64
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Update state: add the saved pre-block state (feed-forward).
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
    }

    // Save final state
    vst1q_u32(&s[0], STATE0);
    vst1q_u32(&s[4], STATE1);
}
}
199  
200  namespace sha256d64_arm_shani {
201  void Transform_2way(unsigned char* output, const unsigned char* input)
202  {
203      /* Initial state. */
204      alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
205          0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
206          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
207      };
208  
209      /* Precomputed message schedule for the 2nd transform. */
210      alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
211          0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
212          0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
213          0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
214          0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
215          0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
216          0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
217          0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
218          0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
219          0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
220          0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
221          0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
222          0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
223          0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
224          0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
225          0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
226          0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
227      };
228  
229      /* A few precomputed message schedule values for the 3rd transform. */
230      alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
231          0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
232          0x80000000, 0x00000000, 0x00000000, 0x00000000,
233          0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
234      };
235  
236      /* Padding processed in the 3rd transform (byteswapped). */
237      alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
238  
239      uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
240      uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
241      uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
242  
243      // Transform 1: Load state
244      STATE0A = vld1q_u32(&INIT[0]);
245      STATE0B = STATE0A;
246      STATE1A = vld1q_u32(&INIT[4]);
247      STATE1B = STATE1A;
248  
249      // Transform 1: Load and convert input chunk to Big Endian
250      MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
251      MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
252      MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
253      MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
254      MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
255      MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
256      MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
257      MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
258  
259      // Transform 1: Rounds 1-4
260      TMP = vld1q_u32(&K[0]);
261      TMP0A = vaddq_u32(MSG0A, TMP);
262      TMP0B = vaddq_u32(MSG0B, TMP);
263      TMP2A = STATE0A;
264      TMP2B = STATE0B;
265      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
266      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
267      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
268      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
269      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
270      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
271      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
272      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
273  
274      // Transform 1: Rounds 5-8
275      TMP = vld1q_u32(&K[4]);
276      TMP0A = vaddq_u32(MSG1A, TMP);
277      TMP0B = vaddq_u32(MSG1B, TMP);
278      TMP2A = STATE0A;
279      TMP2B = STATE0B;
280      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
281      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
282      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
283      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
284      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
285      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
286      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
287      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
288  
289      // Transform 1: Rounds 9-12
290      TMP = vld1q_u32(&K[8]);
291      TMP0A = vaddq_u32(MSG2A, TMP);
292      TMP0B = vaddq_u32(MSG2B, TMP);
293      TMP2A = STATE0A;
294      TMP2B = STATE0B;
295      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
296      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
297      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
298      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
299      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
300      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
301      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
302      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
303  
304      // Transform 1: Rounds 13-16
305      TMP = vld1q_u32(&K[12]);
306      TMP0A = vaddq_u32(MSG3A, TMP);
307      TMP0B = vaddq_u32(MSG3B, TMP);
308      TMP2A = STATE0A;
309      TMP2B = STATE0B;
310      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
311      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
312      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
313      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
314      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
315      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
316      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
317      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
318  
319      // Transform 1: Rounds 17-20
320      TMP = vld1q_u32(&K[16]);
321      TMP0A = vaddq_u32(MSG0A, TMP);
322      TMP0B = vaddq_u32(MSG0B, TMP);
323      TMP2A = STATE0A;
324      TMP2B = STATE0B;
325      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
326      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
327      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
328      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
329      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
330      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
331      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
332      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
333  
334      // Transform 1: Rounds 21-24
335      TMP = vld1q_u32(&K[20]);
336      TMP0A = vaddq_u32(MSG1A, TMP);
337      TMP0B = vaddq_u32(MSG1B, TMP);
338      TMP2A = STATE0A;
339      TMP2B = STATE0B;
340      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
341      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
342      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
343      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
344      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
345      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
346      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
347      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
348  
349      // Transform 1: Rounds 25-28
350      TMP = vld1q_u32(&K[24]);
351      TMP0A = vaddq_u32(MSG2A, TMP);
352      TMP0B = vaddq_u32(MSG2B, TMP);
353      TMP2A = STATE0A;
354      TMP2B = STATE0B;
355      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
356      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
357      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
358      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
359      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
360      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
361      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
362      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
363  
364      // Transform 1: Rounds 29-32
365      TMP = vld1q_u32(&K[28]);
366      TMP0A = vaddq_u32(MSG3A, TMP);
367      TMP0B = vaddq_u32(MSG3B, TMP);
368      TMP2A = STATE0A;
369      TMP2B = STATE0B;
370      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
371      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
372      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
373      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
374      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
375      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
376      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
377      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
378  
379      // Transform 1: Rounds 33-36
380      TMP = vld1q_u32(&K[32]);
381      TMP0A = vaddq_u32(MSG0A, TMP);
382      TMP0B = vaddq_u32(MSG0B, TMP);
383      TMP2A = STATE0A;
384      TMP2B = STATE0B;
385      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
386      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
387      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
388      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
389      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
390      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
391      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
392      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
393  
394      // Transform 1: Rounds 37-40
395      TMP = vld1q_u32(&K[36]);
396      TMP0A = vaddq_u32(MSG1A, TMP);
397      TMP0B = vaddq_u32(MSG1B, TMP);
398      TMP2A = STATE0A;
399      TMP2B = STATE0B;
400      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
401      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
402      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
403      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
404      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
405      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
406      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
407      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
408  
409      // Transform 1: Rounds 41-44
410      TMP = vld1q_u32(&K[40]);
411      TMP0A = vaddq_u32(MSG2A, TMP);
412      TMP0B = vaddq_u32(MSG2B, TMP);
413      TMP2A = STATE0A;
414      TMP2B = STATE0B;
415      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
416      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
417      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
418      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
419      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
420      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
421      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
422      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
423  
424      // Transform 1: Rounds 45-48
425      TMP = vld1q_u32(&K[44]);
426      TMP0A = vaddq_u32(MSG3A, TMP);
427      TMP0B = vaddq_u32(MSG3B, TMP);
428      TMP2A = STATE0A;
429      TMP2B = STATE0B;
430      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
431      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
432      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
433      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
434      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
435      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
436      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
437      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
438  
439      // Transform 1: Rounds 49-52
440      TMP = vld1q_u32(&K[48]);
441      TMP0A = vaddq_u32(MSG0A, TMP);
442      TMP0B = vaddq_u32(MSG0B, TMP);
443      TMP2A = STATE0A;
444      TMP2B = STATE0B;
445      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
446      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
447      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
448      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
449  
450      // Transform 1: Rounds 53-56
451      TMP = vld1q_u32(&K[52]);
452      TMP0A = vaddq_u32(MSG1A, TMP);
453      TMP0B = vaddq_u32(MSG1B, TMP);
454      TMP2A = STATE0A;
455      TMP2B = STATE0B;
456      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
457      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
458      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
459      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
460  
461      // Transform 1: Rounds 57-60
462      TMP = vld1q_u32(&K[56]);
463      TMP0A = vaddq_u32(MSG2A, TMP);
464      TMP0B = vaddq_u32(MSG2B, TMP);
465      TMP2A = STATE0A;
466      TMP2B = STATE0B;
467      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
468      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
469      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
470      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
471  
472      // Transform 1: Rounds 61-64
473      TMP = vld1q_u32(&K[60]);
474      TMP0A = vaddq_u32(MSG3A, TMP);
475      TMP0B = vaddq_u32(MSG3B, TMP);
476      TMP2A = STATE0A;
477      TMP2B = STATE0B;
478      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
479      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
480      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
481      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
482  
483      // Transform 1: Update state
484      TMP = vld1q_u32(&INIT[0]);
485      STATE0A = vaddq_u32(STATE0A, TMP);
486      STATE0B = vaddq_u32(STATE0B, TMP);
487      TMP = vld1q_u32(&INIT[4]);
488      STATE1A = vaddq_u32(STATE1A, TMP);
489      STATE1B = vaddq_u32(STATE1B, TMP);
490  
491      // Transform 2: Save state
492      ABEF_SAVEA = STATE0A;
493      ABEF_SAVEB = STATE0B;
494      CDGH_SAVEA = STATE1A;
495      CDGH_SAVEB = STATE1B;
496  
497      // Transform 2: Rounds 1-4
498      TMP = vld1q_u32(&MIDS[0]);
499      TMP2A = STATE0A;
500      TMP2B = STATE0B;
501      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
502      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
503      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
504      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
505  
506      // Transform 2: Rounds 5-8
507      TMP = vld1q_u32(&MIDS[4]);
508      TMP2A = STATE0A;
509      TMP2B = STATE0B;
510      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
511      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
512      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
513      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
514  
515      // Transform 2: Rounds 9-12
516      TMP = vld1q_u32(&MIDS[8]);
517      TMP2A = STATE0A;
518      TMP2B = STATE0B;
519      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
520      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
521      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
522      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
523  
524      // Transform 2: Rounds 13-16
525      TMP = vld1q_u32(&MIDS[12]);
526      TMP2A = STATE0A;
527      TMP2B = STATE0B;
528      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
529      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
530      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
531      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
532  
533      // Transform 2: Rounds 17-20
534      TMP = vld1q_u32(&MIDS[16]);
535      TMP2A = STATE0A;
536      TMP2B = STATE0B;
537      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
538      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
539      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
540      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
541  
542      // Transform 2: Rounds 21-24
543      TMP = vld1q_u32(&MIDS[20]);
544      TMP2A = STATE0A;
545      TMP2B = STATE0B;
546      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
547      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
548      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
549      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
550  
551      // Transform 2: Rounds 25-28
552      TMP = vld1q_u32(&MIDS[24]);
553      TMP2A = STATE0A;
554      TMP2B = STATE0B;
555      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
556      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
557      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
558      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
559  
560      // Transform 2: Rounds 29-32
561      TMP = vld1q_u32(&MIDS[28]);
562      TMP2A = STATE0A;
563      TMP2B = STATE0B;
564      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
565      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
566      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
567      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
568  
569      // Transform 2: Rounds 33-36
570      TMP = vld1q_u32(&MIDS[32]);
571      TMP2A = STATE0A;
572      TMP2B = STATE0B;
573      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
574      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
575      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
576      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
577  
578      // Transform 2: Rounds 37-40
579      TMP = vld1q_u32(&MIDS[36]);
580      TMP2A = STATE0A;
581      TMP2B = STATE0B;
582      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
583      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
584      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
585      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
586  
587      // Transform 2: Rounds 41-44
588      TMP = vld1q_u32(&MIDS[40]);
589      TMP2A = STATE0A;
590      TMP2B = STATE0B;
591      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
592      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
593      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
594      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
595  
596      // Transform 2: Rounds 45-48
597      TMP = vld1q_u32(&MIDS[44]);
598      TMP2A = STATE0A;
599      TMP2B = STATE0B;
600      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
601      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
602      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
603      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
604  
605      // Transform 2: Rounds 49-52
606      TMP = vld1q_u32(&MIDS[48]);
607      TMP2A = STATE0A;
608      TMP2B = STATE0B;
609      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
610      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
611      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
612      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
613  
614      // Transform 2: Rounds 53-56
615      TMP = vld1q_u32(&MIDS[52]);
616      TMP2A = STATE0A;
617      TMP2B = STATE0B;
618      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
619      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
620      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
621      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
622  
623      // Transform 2: Rounds 57-60
624      TMP = vld1q_u32(&MIDS[56]);
625      TMP2A = STATE0A;
626      TMP2B = STATE0B;
627      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
628      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
629      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
630      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
631  
632      // Transform 2: Rounds 61-64
633      TMP = vld1q_u32(&MIDS[60]);
634      TMP2A = STATE0A;
635      TMP2B = STATE0B;
636      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
637      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
638      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
639      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
640  
641      // Transform 2: Update state
642      STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
643      STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
644      STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
645      STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
646  
647      // Transform 3: Pad previous output
648      MSG0A = STATE0A;
649      MSG0B = STATE0B;
650      MSG1A = STATE1A;
651      MSG1B = STATE1B;
652      MSG2A = vld1q_u32(&FINAL[0]);
653      MSG2B = MSG2A;
654      MSG3A = vld1q_u32(&FINAL[4]);
655      MSG3B = MSG3A;
656  
657      // Transform 3: Load state
658      STATE0A = vld1q_u32(&INIT[0]);
659      STATE0B = STATE0A;
660      STATE1A = vld1q_u32(&INIT[4]);
661      STATE1B = STATE1A;
662  
663      // Transform 3: Rounds 1-4
664      TMP = vld1q_u32(&K[0]);
665      TMP0A = vaddq_u32(MSG0A, TMP);
666      TMP0B = vaddq_u32(MSG0B, TMP);
667      TMP2A = STATE0A;
668      TMP2B = STATE0B;
669      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
670      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
671      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
672      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
673      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
674      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
675      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
676      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
677  
678      // Transform 3: Rounds 5-8
679      TMP = vld1q_u32(&K[4]);
680      TMP0A = vaddq_u32(MSG1A, TMP);
681      TMP0B = vaddq_u32(MSG1B, TMP);
682      TMP2A = STATE0A;
683      TMP2B = STATE0B;
684      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
685      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
686      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
687      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
688      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
689      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
690      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
691      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
692  
693      // Transform 3: Rounds 9-12
694      TMP = vld1q_u32(&FINS[0]);
695      TMP2A = STATE0A;
696      TMP2B = STATE0B;
697      MSG2A = vld1q_u32(&FINS[4]);
698      MSG2B = MSG2A;
699      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
700      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
701      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
702      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
703      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
704      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
705  
706      // Transform 3: Rounds 13-16
707      TMP = vld1q_u32(&FINS[8]);
708      TMP2A = STATE0A;
709      TMP2B = STATE0B;
710      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
711      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
712      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
713      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
714      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
715      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
716      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
717      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
718  
719      // Transform 3: Rounds 17-20
720      TMP = vld1q_u32(&K[16]);
721      TMP0A = vaddq_u32(MSG0A, TMP);
722      TMP0B = vaddq_u32(MSG0B, TMP);
723      TMP2A = STATE0A;
724      TMP2B = STATE0B;
725      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
726      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
727      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
728      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
729      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
730      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
731      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
732      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
733  
734      // Transform 3: Rounds 21-24
735      TMP = vld1q_u32(&K[20]);
736      TMP0A = vaddq_u32(MSG1A, TMP);
737      TMP0B = vaddq_u32(MSG1B, TMP);
738      TMP2A = STATE0A;
739      TMP2B = STATE0B;
740      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
741      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
742      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
743      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
744      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
745      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
746      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
747      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
748  
749      // Transform 3: Rounds 25-28
750      TMP = vld1q_u32(&K[24]);
751      TMP0A = vaddq_u32(MSG2A, TMP);
752      TMP0B = vaddq_u32(MSG2B, TMP);
753      TMP2A = STATE0A;
754      TMP2B = STATE0B;
755      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
756      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
757      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
758      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
759      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
760      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
761      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
762      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
763  
764      // Transform 3: Rounds 29-32
765      TMP = vld1q_u32(&K[28]);
766      TMP0A = vaddq_u32(MSG3A, TMP);
767      TMP0B = vaddq_u32(MSG3B, TMP);
768      TMP2A = STATE0A;
769      TMP2B = STATE0B;
770      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
771      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
772      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
773      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
774      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
775      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
776      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
777      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
778  
779      // Transform 3: Rounds 33-36
780      TMP = vld1q_u32(&K[32]);
781      TMP0A = vaddq_u32(MSG0A, TMP);
782      TMP0B = vaddq_u32(MSG0B, TMP);
783      TMP2A = STATE0A;
784      TMP2B = STATE0B;
785      MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
786      MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
787      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
788      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
789      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
790      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
791      MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
792      MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
793  
794      // Transform 3: Rounds 37-40
795      TMP = vld1q_u32(&K[36]);
796      TMP0A = vaddq_u32(MSG1A, TMP);
797      TMP0B = vaddq_u32(MSG1B, TMP);
798      TMP2A = STATE0A;
799      TMP2B = STATE0B;
800      MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
801      MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
802      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
803      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
804      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
805      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
806      MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
807      MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
808  
809      // Transform 3: Rounds 41-44
810      TMP = vld1q_u32(&K[40]);
811      TMP0A = vaddq_u32(MSG2A, TMP);
812      TMP0B = vaddq_u32(MSG2B, TMP);
813      TMP2A = STATE0A;
814      TMP2B = STATE0B;
815      MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
816      MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
817      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
818      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
819      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
820      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
821      MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
822      MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
823  
824      // Transform 3: Rounds 45-48
825      TMP = vld1q_u32(&K[44]);
826      TMP0A = vaddq_u32(MSG3A, TMP);
827      TMP0B = vaddq_u32(MSG3B, TMP);
828      TMP2A = STATE0A;
829      TMP2B = STATE0B;
830      MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
831      MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
832      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
833      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
834      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
835      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
836      MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
837      MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
838  
839      // Transform 3: Rounds 49-52
840      TMP = vld1q_u32(&K[48]);
841      TMP0A = vaddq_u32(MSG0A, TMP);
842      TMP0B = vaddq_u32(MSG0B, TMP);
843      TMP2A = STATE0A;
844      TMP2B = STATE0B;
845      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
846      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
847      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
848      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
849  
850      // Transform 3: Rounds 53-56
851      TMP = vld1q_u32(&K[52]);
852      TMP0A = vaddq_u32(MSG1A, TMP);
853      TMP0B = vaddq_u32(MSG1B, TMP);
854      TMP2A = STATE0A;
855      TMP2B = STATE0B;
856      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
857      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
858      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
859      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
860  
861      // Transform 3: Rounds 57-60
862      TMP = vld1q_u32(&K[56]);
863      TMP0A = vaddq_u32(MSG2A, TMP);
864      TMP0B = vaddq_u32(MSG2B, TMP);
865      TMP2A = STATE0A;
866      TMP2B = STATE0B;
867      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
868      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
869      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
870      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
871  
872      // Transform 3: Rounds 61-64
873      TMP = vld1q_u32(&K[60]);
874      TMP0A = vaddq_u32(MSG3A, TMP);
875      TMP0B = vaddq_u32(MSG3B, TMP);
876      TMP2A = STATE0A;
877      TMP2B = STATE0B;
878      STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
879      STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
880      STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
881      STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
882  
883      // Transform 3: Update state
884      TMP = vld1q_u32(&INIT[0]);
885      STATE0A = vaddq_u32(STATE0A, TMP);
886      STATE0B = vaddq_u32(STATE0B, TMP);
887      TMP = vld1q_u32(&INIT[4]);
888      STATE1A = vaddq_u32(STATE1A, TMP);
889      STATE1B = vaddq_u32(STATE1B, TMP);
890  
891      // Store result
892      vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
893      vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
894      vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
895      vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
896  }
897  }
898  
899  #endif