crc32c_arm64.cc
1 // Copyright 2017 The CRC32C Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 5 #include "./crc32c_arm64.h" 6 7 // In a separate source file to allow this accelerated CRC32C function to be 8 // compiled with the appropriate compiler flags to enable ARM NEON CRC32C 9 // instructions. 10 11 // This implementation is based on https://github.com/google/leveldb/pull/490. 12 13 #include <cstddef> 14 #include <cstdint> 15 #include <cstring> 16 17 #include "./crc32c_internal.h" 18 #ifdef CRC32C_HAVE_CONFIG_H 19 #include "crc32c/crc32c_config.h" 20 #endif 21 22 #if HAVE_ARM64_CRC32C 23 24 #include <arm_acle.h> 25 #include <arm_neon.h> 26 27 #define KBYTES 1032 28 #define SEGMENTBYTES 256 29 30 // compute 8bytes for each segment parallelly 31 #define CRC32C32BYTES(P, IND) \ 32 do { \ 33 std::memcpy(&d64, (P) + SEGMENTBYTES * 1 + (IND) * 8, sizeof(d64)); \ 34 crc1 = __crc32cd(crc1, d64); \ 35 std::memcpy(&d64, (P) + SEGMENTBYTES * 2 + (IND) * 8, sizeof(d64)); \ 36 crc2 = __crc32cd(crc2, d64); \ 37 std::memcpy(&d64, (P) + SEGMENTBYTES * 3 + (IND) * 8, sizeof(d64)); \ 38 crc3 = __crc32cd(crc3, d64); \ 39 std::memcpy(&d64, (P) + SEGMENTBYTES * 0 + (IND) * 8, sizeof(d64)); \ 40 crc0 = __crc32cd(crc0, d64); \ 41 } while (0); 42 43 // compute 8*8 bytes for each segment parallelly 44 #define CRC32C256BYTES(P, IND) \ 45 do { \ 46 CRC32C32BYTES((P), (IND)*8 + 0) \ 47 CRC32C32BYTES((P), (IND)*8 + 1) \ 48 CRC32C32BYTES((P), (IND)*8 + 2) \ 49 CRC32C32BYTES((P), (IND)*8 + 3) \ 50 CRC32C32BYTES((P), (IND)*8 + 4) \ 51 CRC32C32BYTES((P), (IND)*8 + 5) \ 52 CRC32C32BYTES((P), (IND)*8 + 6) \ 53 CRC32C32BYTES((P), (IND)*8 + 7) \ 54 } while (0); 55 56 // compute 4*8*8 bytes for each segment parallelly 57 #define CRC32C1024BYTES(P) \ 58 do { \ 59 CRC32C256BYTES((P), 0) \ 60 CRC32C256BYTES((P), 1) \ 61 CRC32C256BYTES((P), 2) \ 62 CRC32C256BYTES((P), 3) \ 63 (P) += 4 * SEGMENTBYTES; \ 64 } while (0) 65 66 namespace crc32c { 67 68 uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) { 69 int64_t length = size; 70 uint32_t crc0, crc1, crc2, crc3; 71 uint64_t t0, t1, t2; 72 uint16_t d16; 73 uint32_t d32; 74 uint64_t d64; 75 76 // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)), 77 // k2=CRC(x^(SEGMENTBYTES*8)) 78 const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4; 79 80 crc = crc ^ kCRC32Xor; 81 82 while (length >= KBYTES) { 83 crc0 = crc; 84 crc1 = 0; 85 crc2 = 0; 86 crc3 = 0; 87 88 // Process 1024 bytes in parallel. 89 CRC32C1024BYTES(data); 90 91 // Merge the 4 partial CRC32C values. 92 t2 = (uint64_t)vmull_p64(crc2, k2); 93 t1 = (uint64_t)vmull_p64(crc1, k1); 94 t0 = (uint64_t)vmull_p64(crc0, k0); 95 std::memcpy(&d64, data, sizeof(d64)); 96 crc = __crc32cd(crc3, d64); 97 data += sizeof(uint64_t); 98 crc ^= __crc32cd(0, t2); 99 crc ^= __crc32cd(0, t1); 100 crc ^= __crc32cd(0, t0); 101 102 length -= KBYTES; 103 } 104 105 while (length >= 8) { 106 std::memcpy(&d64, data, sizeof(d64)); 107 crc = __crc32cd(crc, d64); 108 data += 8; 109 length -= 8; 110 } 111 112 if (length & 4) { 113 std::memcpy(&d32, data, sizeof(d32)); 114 crc = __crc32cw(crc, d32); 115 data += 4; 116 } 117 118 if (length & 2) { 119 std::memcpy(&d16, data, sizeof(d16)); 120 crc = __crc32ch(crc, d16); 121 data += 2; 122 } 123 124 if (length & 1) { 125 crc = __crc32cb(crc, *data); 126 } 127 128 return crc ^ kCRC32Xor; 129 } 130 131 } // namespace crc32c 132 133 #endif // HAVE_ARM64_CRC32C