/ external / libucl / src / mum.h
mum.h
  1  /* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>
  2  
  3     Permission is hereby granted, free of charge, to any person
  4     obtaining a copy of this software and associated documentation
  5     files (the "Software"), to deal in the Software without
  6     restriction, including without limitation the rights to use, copy,
  7     modify, merge, publish, distribute, sublicense, and/or sell copies
  8     of the Software, and to permit persons to whom the Software is
  9     furnished to do so, subject to the following conditions:
 10  
 11     The above copyright notice and this permission notice shall be
 12     included in all copies or substantial portions of the Software.
 13  
 14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 15     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 16     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 17     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 18     BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 19     ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 20     CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21     SOFTWARE.
 22  */
 23  
 24  /* This file implements MUM (MUltiply and Mix) hashing.  We randomize
 25     input data by 64x64-bit multiplication and mixing hi- and low-parts
 26     of the multiplication result by using an addition and then mix it
 27     into the current state.  We use prime numbers randomly generated
 28     with the equal probability of their bit values for the
 29     multiplication.  When all primes are used once, the state is
 30     randomized and the same prime numbers are used again for data
 31     randomization.
 32  
 33     The MUM hashing passes all SMHasher tests.  Pseudo Random Number
 34     Generator based on MUM also passes NIST Statistical Test Suite for
 35     Random and Pseudorandom Number Generators for Cryptographic
 36     Applications (version 2.2.1) with 1000 bitstreams each containing
 37     1M bits.  MUM hashing is also faster Spooky64 and City64 on small
 38     strings (at least up to 512-bit) on Haswell and Power7.  The MUM bulk
 39     speed (speed on very long data) is bigger than Spooky and City on
 40     Power7.  On Haswell the bulk speed is bigger than Spooky one and
 41     close to City speed.  */
 42  
 43  #ifndef __MUM_HASH__
 44  #define __MUM_HASH__
 45  
 46  #include <stddef.h>
 47  #include <stdlib.h>
 48  #include <string.h>
 49  #include <limits.h>
 50  
 51  #ifdef _MSC_VER
 52  typedef unsigned __int16 uint16_t;
 53  typedef unsigned __int32 uint32_t;
 54  typedef unsigned __int64 uint64_t;
 55  #else
 56  #include <stdint.h>
 57  #endif
 58  
 59  /* Macro saying to use 128-bit integers implemented by GCC for some
 60     targets.  */
 61  #ifndef _MUM_USE_INT128
 62  /* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
 63     HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
 64     otherwise int. */
 65  #if defined(__GNUC__) && UINT_MAX != ULONG_MAX
 66  #define _MUM_USE_INT128 1
 67  #else
 68  #define _MUM_USE_INT128 0
 69  #endif
 70  #endif
 71  
 72  #if defined(__GNUC__) && ((__GNUC__ == 4) &&  (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
 73  #define _MUM_FRESH_GCC
 74  #endif
 75  
 76  #if defined(__GNUC__) && !defined(__llvm__) && defined(_MUM_FRESH_GCC)
 77  #define _MUM_ATTRIBUTE_UNUSED  __attribute__((unused))
 78  #define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
 79  #define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
 80  #else
 81  #define _MUM_ATTRIBUTE_UNUSED
 82  #define _MUM_OPTIMIZE(opts)
 83  #define _MUM_TARGET(opts)
 84  #endif
 85  
 86  
 87  /* Here are different primes randomly generated with the equal
 88     probability of their bit values.  They are used to randomize input
 89     values.  */
 90  static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
 91  static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
 92  static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
 93  static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
 94  static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
 95  static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
 96  static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;
 97  
 98  static uint64_t _mum_primes [] = {
 99    0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
100    0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
101    0X3bc721b2aad05197, 0X71b1a19b907d6e33, 0X525e6c1084a8534b, 0X9e4c2cd340c1299f,
102    0Xde3add92e94caa37, 0X7e14eadb1f65311d, 0X3f5aa40f89812853, 0X33b15a3b587d15c9,
103  };
104  
105  /* Multiply 64-bit V and P and return sum of high and low parts of the
106     result.  */
107  static inline uint64_t
108  _mum (uint64_t v, uint64_t p) {
109    uint64_t hi, lo;
110  #if _MUM_USE_INT128
111  #if defined(__aarch64__)
112    /* AARCH64 needs 2 insns to calculate 128-bit result of the
113       multiplication.  If we use a generic code we actually call a
114       function doing 128x128->128 bit multiplication.  The function is
115       very slow.  */
116    lo = v * p, hi;
117    asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
118  #else
119    __uint128_t r = (__uint128_t) v * (__uint128_t) p;
120    hi = (uint64_t) (r >> 64);
121    lo = (uint64_t) r;
122  #endif
123  #else
124    /* Implementation of 64x64->128-bit multiplication by four 32x32->64
125       bit multiplication.  */
126    uint64_t hv = v >> 32, hp = p >> 32;
127    uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
128    uint64_t rh =  hv * hp;
129    uint64_t rm_0 = hv * lp;
130    uint64_t rm_1 = hp * lv;
131    uint64_t rl =  lv * lp;
132    uint64_t t, carry = 0;
133  
134    /* We could ignore a carry bit here if we did not care about the
135       same hash for 32-bit and 64-bit targets.  */
136    t = rl + (rm_0 << 32);
137  #ifdef MUM_TARGET_INDEPENDENT_HASH
138    carry = t < rl;
139  #endif
140    lo = t + (rm_1 << 32);
141  #ifdef MUM_TARGET_INDEPENDENT_HASH
142    carry += lo < t;
143  #endif
144    hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
145  #endif
146    /* We could use XOR here too but, for some reasons, on Haswell and
147       Power7 using an addition improves hashing performance by 10% for
148       small strings.  */
149    return hi + lo;
150  }
151  
152  #if defined(_MSC_VER)
153  #define _mum_bswap_32(x) _byteswap_uint32_t (x)
154  #define _mum_bswap_64(x) _byteswap_uint64_t (x)
155  #elif defined(__APPLE__)
156  #include <libkern/OSByteOrder.h>
157  #define _mum_bswap_32(x) OSSwapInt32 (x)
158  #define _mum_bswap_64(x) OSSwapInt64 (x)
159  #elif defined(__GNUC__)
160  #define _mum_bswap32(x) __builtin_bswap32 (x)
161  #define _mum_bswap64(x) __builtin_bswap64 (x)
162  #else
163  #include <byteswap.h>
164  #define _mum_bswap32(x) bswap32 (x)
165  #define _mum_bswap64(x) bswap64 (x)
166  #endif
167  
168  static inline uint64_t
169  _mum_le (uint64_t v) {
170  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
171    return v;
172  #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
173    return _mum_bswap64 (v);
174  #else
175  #error "Unknown endianness"
176  #endif
177  }
178  
179  static inline uint32_t
180  _mum_le32 (uint32_t v) {
181  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
182    return v;
183  #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
184    return _mum_bswap32 (v);
185  #else
186  #error "Unknown endianness"
187  #endif
188  }
189  
190  /* Macro defining how many times the most nested loop in
191     _mum_hash_aligned will be unrolled by the compiler (although it can
192     make an own decision:).  Use only a constant here to help a
193     compiler to unroll a major loop.
194  
195     The macro value affects the result hash for strings > 128 bit.  The
196     unroll factor greatly affects the hashing speed.  We prefer the
197     speed.  */
198  #ifndef _MUM_UNROLL_FACTOR_POWER
199  #if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
200  #define _MUM_UNROLL_FACTOR_POWER 3
201  #elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
202  #define _MUM_UNROLL_FACTOR_POWER 4
203  #else
204  #define _MUM_UNROLL_FACTOR_POWER 2
205  #endif
206  #endif
207  
208  #if _MUM_UNROLL_FACTOR_POWER < 1
209  #error "too small unroll factor"
210  #elif _MUM_UNROLL_FACTOR_POWER > 4
211  #error "We have not enough primes for such unroll factor"
212  #endif
213  
214  #define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
215  
216  static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
217  _mum_hash_aligned (uint64_t start, const void *key, size_t len) {
218    uint64_t result = start;
219    const unsigned char *str = (const unsigned char *) key;
220    uint64_t u64;
221    int i;
222    size_t n;
223  
224    result = _mum (result, _mum_block_start_prime);
225    while  (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
226      /* This loop could be vectorized when we have vector insns for
227         64x64->128-bit multiplication.  AVX2 currently only have a
228         vector insn for 4 32x32->64-bit multiplication.  */
229      for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
230        result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
231      len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
232      str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
233      /* We will use the same prime numbers on the next iterations --
234         randomize the state.  */
235      result = _mum (result, _mum_unroll_prime);
236    }
237    n = len / sizeof (uint64_t);
238    for (i = 0; i < (int)n; i++)
239      result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
240    len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
241    switch (len) {
242    case 7:
243      u64 = _mum_le32 (*(uint32_t *) str);
244      u64 |= (uint64_t) str[4] << 32;
245      u64 |= (uint64_t) str[5] << 40;
246      u64 |= (uint64_t) str[6] << 48;
247      return result ^ _mum (u64, _mum_tail_prime);
248    case 6:
249      u64 = _mum_le32 (*(uint32_t *) str);
250      u64 |= (uint64_t) str[4] << 32;
251      u64 |= (uint64_t) str[5] << 40;
252      return result ^ _mum (u64, _mum_tail_prime);
253    case 5:
254      u64 = _mum_le32 (*(uint32_t *) str);
255      u64 |= (uint64_t) str[4] << 32;
256      return result ^ _mum (u64, _mum_tail_prime);
257    case 4:
258      u64 = _mum_le32 (*(uint32_t *) str);
259      return result ^ _mum (u64, _mum_tail_prime);
260    case 3:
261      u64 = str[0];
262      u64 |= (uint64_t) str[1] << 8;
263      u64 |= (uint64_t) str[2] << 16;
264      return result ^ _mum (u64, _mum_tail_prime);
265    case 2:
266      u64 = str[0];
267      u64 |= (uint64_t) str[1] << 8;
268      return result ^ _mum (u64, _mum_tail_prime);
269    case 1:
270      u64 = str[0];
271      return result ^ _mum (u64, _mum_tail_prime);
272    }
273    return result;
274  }
275  
276  /* Final randomization of H.  */
277  static inline uint64_t
278  _mum_final (uint64_t h) {
279    h ^= _mum (h, _mum_finish_prime1);
280    h ^= _mum (h, _mum_finish_prime2);
281    return h;
282  }
283  
284  #if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
285  
286  /* We want to use AVX2 insn MULX instead of generic x86-64 MULQ where
287     it is possible.  Although on modern Intel processors MULQ takes
288     3-cycles vs. 4 for MULX, MULX permits more freedom in insn
289     scheduling as it uses less fixed registers.  */
290  static inline uint64_t _MUM_TARGET("arch=haswell")
291  _mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
292    return _mum_final (_mum_hash_aligned (seed + len, key, len));
293  }
294  #endif
295  
296  #ifndef _MUM_UNALIGNED_ACCESS
297  #if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
298      || defined(__s390__) || defined(__m32c__) || defined(cris)     \
299      || defined(__CR16__) || defined(__vax__) || defined(__m68k__) \
300      || defined(__aarch64__)
301  #define _MUM_UNALIGNED_ACCESS 1
302  #else
303  #define _MUM_UNALIGNED_ACCESS 0
304  #endif
305  #endif
306  
307  /* When we need an aligned access to data being hashed we move part of
308     the unaligned data to an aligned block of given size and then
309     process it, repeating processing the data by the block.  */
310  #ifndef _MUM_BLOCK_LEN
311  #define _MUM_BLOCK_LEN 1024
312  #endif
313  
314  #if _MUM_BLOCK_LEN < 8
315  #error "too small block length"
316  #endif
317  
318  static inline uint64_t
319  #if defined(__x86_64__)
320  _MUM_TARGET("inline-all-stringops")
321  #endif
322  _mum_hash_default (const void *key, size_t len, uint64_t seed) {
323    uint64_t result;
324    const unsigned char *str = (const unsigned char *) key;
325    size_t block_len;
326    uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
327  
328    result = seed + len;
329    if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
330      result = _mum_hash_aligned (result, key, len);
331    else {
332      while (len != 0) {
333        block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
334        memmove (buf, str, block_len);
335        result = _mum_hash_aligned (result, buf, block_len);
336        len -= block_len;
337        str += block_len;
338      }
339    }
340    return _mum_final (result);
341  }
342  
343  static inline uint64_t
344  _mum_next_factor (void) {
345    uint64_t start = 0;
346    int i;
347  
348    for (i = 0; i < 8; i++)
349      start = (start << 8) | rand() % 256;
350    return start;
351  }
352  
353  /* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */
354  
355  /* Set random multiplicators depending on SEED.  */
356  static inline void
357  mum_hash_randomize (uint64_t seed) {
358    int i;
359  
360    srand (seed);
361    _mum_hash_step_prime = _mum_next_factor ();
362    _mum_key_step_prime = _mum_next_factor ();
363    _mum_finish_prime1 = _mum_next_factor ();
364    _mum_finish_prime2 = _mum_next_factor ();
365    _mum_block_start_prime = _mum_next_factor ();
366    _mum_unroll_prime = _mum_next_factor ();
367    _mum_tail_prime = _mum_next_factor ();
368    for (i = 0; i < (int)(sizeof (_mum_primes) / sizeof (uint64_t)); i++)
369      _mum_primes[i] = _mum_next_factor ();
370  }
371  
372  /* Start hashing data with SEED.  Return the state.  */
373  static inline uint64_t
374  mum_hash_init (uint64_t seed) {
375    return seed;
376  }
377  
378  /* Process data KEY with the state H and return the updated state.  */
379  static inline uint64_t
380  mum_hash_step (uint64_t h, uint64_t key)
381  {
382    return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
383  }
384  
385  /* Return the result of hashing using the current state H.  */
386  static inline uint64_t
387  mum_hash_finish (uint64_t h) {
388    return _mum_final (h);
389  }
390  
391  /* Fast hashing of KEY with SEED.  The hash is always the same for the
392     same key on any target. */
393  static inline size_t
394  mum_hash64 (uint64_t key, uint64_t seed) {
395    return mum_hash_finish (mum_hash_step (mum_hash_init (seed), key));
396  }
397  
398  /* Hash data KEY of length LEN and SEED.  The hash depends on the
399     target endianness and the unroll factor.  */
400  static inline uint64_t
401  mum_hash (const void *key, size_t len, uint64_t seed) {
402  #if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
403    static int avx2_support = 0;
404  
405    if (avx2_support > 0)
406      return _mum_hash_avx2 (key, len, seed);
407    else if (! avx2_support) {
408      __builtin_cpu_init ();
409      avx2_support =  __builtin_cpu_supports ("avx2") ? 1 : -1;
410      if (avx2_support > 0)
411        return _mum_hash_avx2 (key, len, seed);
412    }
413  #endif
414    return _mum_hash_default (key, len, seed);
415  }
416  
417  #endif