/* src/hash/ghash_pclmul.c */
  1  /*
  2   * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
  3   *
  4   * Permission is hereby granted, free of charge, to any person obtaining 
  5   * a copy of this software and associated documentation files (the
  6   * "Software"), to deal in the Software without restriction, including
  7   * without limitation the rights to use, copy, modify, merge, publish,
  8   * distribute, sublicense, and/or sell copies of the Software, and to
  9   * permit persons to whom the Software is furnished to do so, subject to
 10   * the following conditions:
 11   *
 12   * The above copyright notice and this permission notice shall be 
 13   * included in all copies or substantial portions of the Software.
 14   *
 15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 18   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 19   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 20   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22   * SOFTWARE.
 23   */
 24  
 25  #define BR_ENABLE_INTRINSICS   1
 26  #include "inner.h"
 27  
 28  /*
 29   * This is the GHASH implementation that leverages the pclmulqdq opcode
 30   * (from the AES-NI instructions).
 31   */
 32  
 33  #if BR_AES_X86NI
 34  
 35  /*
 36   * Test CPU support for PCLMULQDQ.
 37   */
 38  static inline int
 39  pclmul_supported(void)
 40  {
 41  	/*
 42  	 * Bit mask for features in ECX:
 43  	 *    1   PCLMULQDQ support
 44  	 */
 45  	return br_cpuid(0, 0, 0x00000002, 0);
 46  }
 47  
 48  /* see bearssl_hash.h */
 49  br_ghash
 50  br_ghash_pclmul_get(void)
 51  {
 52  	return pclmul_supported() ? &br_ghash_pclmul : 0;
 53  }
 54  
 55  BR_TARGETS_X86_UP
 56  
 57  /*
 58   * GHASH is defined over elements of GF(2^128) with "full little-endian"
 59   * representation: leftmost byte is least significant, and, within each
 60   * byte, leftmost _bit_ is least significant. The natural ordering in
 61   * x86 is "mixed little-endian": bytes are ordered from least to most
 62   * significant, but bits within a byte are in most-to-least significant
 63   * order. Going to full little-endian representation would require
 64   * reversing bits within each byte, which is doable but expensive.
 65   *
 66   * Instead, we go to full big-endian representation, by swapping bytes
 67   * around, which is done with a single _mm_shuffle_epi8() opcode (it
 68   * comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
 69   * can use a full big-endian representation because in a carryless
 70   * multiplication, we have a nice bit reversal property:
 71   *
 72   *    rev_128(x) * rev_128(y) = rev_255(x * y)
 73   *
 74   * So by using full big-endian, we still get the right result, except
 75   * that it is right-shifted by 1 bit. The left-shift is relatively
 76   * inexpensive, and it can be mutualised.
 77   *
 78   *
  79   * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 80   * values with bit precision, we have to break down values into 64-bit
 81   * chunks. We number chunks from 0 to 3 in left to right order.
 82   */
 83  
 84  /*
 85   * Byte-swap a complete 128-bit value. This normally uses
 86   * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 87   * However, this crashes old Clang versions, so, for Clang before 3.8,
 88   * we use an alternate (and less efficient) version.
 89   */
 90  #if BR_CLANG && !BR_CLANG_3_8
 91  #define BYTESWAP_DECL
 92  #define BYTESWAP_PREP   (void)0
 93  #define BYTESWAP(x)   do { \
 94  		__m128i byteswap1, byteswap2; \
 95  		byteswap1 = (x); \
 96  		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
 97  		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
 98  		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
 99  		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
100  		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
101  		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
102  	} while (0)
103  #else
104  #define BYTESWAP_DECL   __m128i byteswap_index;
105  #define BYTESWAP_PREP   do { \
106  		byteswap_index = _mm_set_epi8( \
107  			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
108  	} while (0)
109  #define BYTESWAP(x)   do { \
110  		(x) = _mm_shuffle_epi8((x), byteswap_index); \
111  	} while (0)
112  #endif
113  
114  /*
115   * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
116   * for that compiler, we use inline assembly. Inline assembly is
117   * potentially a bit slower because the compiler does not understand
118   * what the opcode does, and thus cannot optimize instruction
119   * scheduling.
120   *
121   * We use a target of "sse2" only, so that Clang may still handle the
122   * '__m128i' type and allocate SSE2 registers.
123   */
124  #if BR_CLANG
125  BR_TARGET("sse2")
126  static inline __m128i
127  pclmulqdq00(__m128i x, __m128i y)
128  {
129  	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
130  	return x;
131  }
132  BR_TARGET("sse2")
133  static inline __m128i
134  pclmulqdq11(__m128i x, __m128i y)
135  {
136  	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
137  	return x;
138  }
139  #else
140  #define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
141  #define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
142  #endif
143  
144  /*
145   * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
146   * halves of kw (into the right half of kx; left half is unspecified).
147   */
148  #define BK(kw, kx)   do { \
149  		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
150  	} while (0)
151  
152  /*
153   * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
154   * the XOR of the two values (kx).
155   */
156  #define PBK(k0, k1, kw, kx)   do { \
157  		kw = _mm_unpacklo_epi64(k1, k0); \
158  		kx = _mm_xor_si128(k0, k1); \
159  	} while (0)
160  
161  /*
162   * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
163   */
164  #define SL_256(x0, x1, x2, x3)   do { \
165  		x0 = _mm_or_si128( \
166  			_mm_slli_epi64(x0, 1), \
167  			_mm_srli_epi64(x1, 63)); \
168  		x1 = _mm_or_si128( \
169  			_mm_slli_epi64(x1, 1), \
170  			_mm_srli_epi64(x2, 63)); \
171  		x2 = _mm_or_si128( \
172  			_mm_slli_epi64(x2, 1), \
173  			_mm_srli_epi64(x3, 63)); \
174  		x3 = _mm_slli_epi64(x3, 1); \
175  	} while (0)
176  
177  /*
178   * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
179   * result is written in x0..x1.
180   */
181  #define REDUCE_F128(x0, x1, x2, x3)   do { \
182  		x1 = _mm_xor_si128( \
183  			x1, \
184  			_mm_xor_si128( \
185  				_mm_xor_si128( \
186  					x3, \
187  					_mm_srli_epi64(x3, 1)), \
188  				_mm_xor_si128( \
189  					_mm_srli_epi64(x3, 2), \
190  					_mm_srli_epi64(x3, 7)))); \
191  		x2 = _mm_xor_si128( \
192  			_mm_xor_si128( \
193  				x2, \
194  				_mm_slli_epi64(x3, 63)), \
195  			_mm_xor_si128( \
196  				_mm_slli_epi64(x3, 62), \
197  				_mm_slli_epi64(x3, 57))); \
198  		x0 = _mm_xor_si128( \
199  			x0, \
200  			_mm_xor_si128( \
201  				_mm_xor_si128( \
202  					x2, \
203  					_mm_srli_epi64(x2, 1)), \
204  				_mm_xor_si128( \
205  					_mm_srli_epi64(x2, 2), \
206  					_mm_srli_epi64(x2, 7)))); \
207  		x1 = _mm_xor_si128( \
208  			_mm_xor_si128( \
209  				x1, \
210  				_mm_slli_epi64(x2, 63)), \
211  			_mm_xor_si128( \
212  				_mm_slli_epi64(x2, 62), \
213  				_mm_slli_epi64(x2, 57))); \
214  	} while (0)
215  
216  /*
217   * Square value kw into (dw,dx).
218   */
219  #define SQUARE_F128(kw, dw, dx)   do { \
220  		__m128i z0, z1, z2, z3; \
221  		z1 = pclmulqdq11(kw, kw); \
222  		z3 = pclmulqdq00(kw, kw); \
223  		z0 = _mm_shuffle_epi32(z1, 0x0E); \
224  		z2 = _mm_shuffle_epi32(z3, 0x0E); \
225  		SL_256(z0, z1, z2, z3); \
226  		REDUCE_F128(z0, z1, z2, z3); \
227  		PBK(z0, z1, dw, dx); \
228  	} while (0)
229  
/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		/*
		 * Partial final block: copy the tail into the stack
		 * buffer and zero-pad up to a whole 16-byte block, so
		 * the loops below only ever see full blocks.
		 */
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y (current GHASH state) and h (hash key); unaligned
	 * loads since the caller's pointers carry no alignment
	 * guarantee. Both are byte-swapped into the big-endian
	 * convention described above; h1x caches the XOR of the two
	 * halves of h for the Karatsuba middle products.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 *
		 * Full 128x128 carryless product with three pclmulqdq
		 * (Karatsuba): high halves into t1, low halves into t3,
		 * and the middle term recovered from the XORed halves.
		 * The shuffles spread the three 128-bit partial
		 * products into four 64-bit words t0..t3, which are
		 * then shifted and reduced.
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);

		/*
		 * Aggregated processing: four blocks per iteration,
		 * computing y' = (y+a0)*h^4 + a1*h^3 + a2*h^2 + a3*h
		 * so that a single shift-and-reduce serves all four
		 * blocks.
		 */
		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			/*
			 * Fold the running state into the first block,
			 * then precompute the half-XORs for the
			 * Karatsuba middle products (aw0 last, since
			 * it was just modified).
			 */
			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			/*
			 * t1 = XOR of the four high-half products,
			 * t3 = XOR of the four low-half products,
			 * t2 = XOR of the four middle products (minus
			 * t1 and t3, per Karatsuba).
			 */
			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			/* Repack (t0:t1) into the 128-bit state. */
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	/*
	 * Remaining blocks, one 16-byte block at a time:
	 * y' = (y + a) * h, with the same Karatsuba multiply and
	 * shift-and-reduce sequence as above.
	 */
	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	/*
	 * Convert the state back to the caller's byte order and store.
	 */
	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}
377  
378  BR_TARGETS_X86_DOWN
379  
380  #else
381  
/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	/*
	 * Built without x86 intrinsics support (BR_AES_X86NI not set):
	 * report the PCLMUL implementation as unavailable.
	 */
	return 0;
}
388  
389  #endif