ipchksum.c
1 /* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ 2 3 #include <commonlib/bsd/helpers.h> 4 #include <commonlib/bsd/ipchksum.h> 5 6 /* See RFC 1071 for mathematical explanations of why we can first sum in a larger register and 7 then narrow down, why we don't need to worry about endianness, etc. */ 8 uint16_t ipchksum(const void *data, size_t size) 9 { 10 const uint8_t *p1 = data; 11 unsigned long wide_sum = 0; 12 uint32_t sum = 0; 13 size_t i = 0; 14 15 #if defined(__aarch64__) 16 size_t size16 = size / 16; 17 const uint64_t *p8 = data; 18 if (size16) { 19 unsigned long tmp1, tmp2; 20 i = size16 * 16; 21 asm ( 22 "adds xzr, xzr, xzr\n\t" /* init carry flag for addition */ 23 "1:\n\t" 24 "ldp %[v1], %[v2], [%[p8]], #16\n\t" 25 "adcs %[wsum], %[wsum], %[v1]\n\t" 26 "adcs %[wsum], %[wsum], %[v2]\n\t" 27 "sub %[size16], %[size16], #1\n\t" 28 "cbnz %[size16], 1b\n\t" 29 "adcs %[wsum], %[wsum], xzr\n\t" /* use up last carry */ 30 : [v1] "=r" (tmp1), 31 [v2] "=r" (tmp2), 32 [wsum] "+r" (wide_sum), 33 [p8] "+r" (p8), 34 [size16] "+r" (size16) 35 :: "cc" 36 ); 37 } 38 #elif defined(__i386__) || defined(__x86_64__) /* __aarch64__ */ 39 size_t size8 = size / 8; 40 const uint64_t *p8 = data; 41 i = size8 * 8; 42 asm ( 43 "clc\n\t" 44 "1:\n\t" 45 "jecxz 2f\n\t" /* technically RCX on 64, but not gonna be that big */ 46 "adc (%[p8]), %[wsum]\n\t" 47 #if defined(__i386__) 48 "adc 4(%[p8]), %[wsum]\n\t" 49 #endif /* __i386__ */ 50 "lea -1(%[size8]), %[size8]\n\t" /* Use LEA as a makeshift ADD that */ 51 "lea 8(%[p8]), %[p8]\n\t" /* doesn't modify the carry flag. */ 52 "jmp 1b\n\t" 53 "2:\n\t" 54 "setc %b[size8]\n\t" /* reuse size register to save last carry */ 55 "add %[size8], %[wsum]\n\t" 56 : [wsum] "+r" (wide_sum), 57 [p8] "+r" (p8), 58 [size8] "+c" (size8) /* put size in ECX so we can JECXZ */ 59 :: "cc" 60 ); 61 #else /* __i386__ || __x86_64__ */ 62 size_t aligned_size = ALIGN_DOWN(size, sizeof(unsigned long)); 63 const unsigned long *p_long = data; 64 for (; i < aligned_size; i += sizeof(unsigned long)) { 65 unsigned long new_sum = wide_sum + *p_long++; 66 /* Overflow check to emulate a manual "add with carry" in C. The compiler seems 67 to be clever enough to find ways to elide the branch on most archs. */ 68 if (new_sum < wide_sum) 69 new_sum++; 70 wide_sum = new_sum; 71 } 72 #endif 73 74 while (wide_sum) { 75 sum += wide_sum & 0xFFFF; 76 wide_sum >>= 16; 77 } 78 sum = (sum & 0xFFFF) + (sum >> 16); 79 80 for (; i < size; i++) { 81 uint32_t v = p1[i]; 82 if (i % 2) 83 v <<= 8; 84 sum += v; 85 86 /* Doing this unconditionally seems to be faster. */ 87 sum = (sum & 0xFFFF) + (sum >> 16); 88 } 89 90 return (uint16_t)~sum; 91 } 92 93 uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second) 94 { 95 first = ~first; 96 second = ~second; 97 98 /* 99 * Since the checksum is calculated in 16-bit chunks, if the offset at which 100 * the data covered by the second checksum would start (if both data streams 101 * came one after the other) is odd, that means the second stream starts in 102 * the middle of a 16-bit chunk. This means the second checksum is byte 103 * swapped compared to what we need it to be, and we must swap it back. 104 */ 105 if (offset % 2) 106 second = (second >> 8) | (second << 8); 107 108 uint32_t sum = first + second; 109 sum = (sum & 0xFFFF) + (sum >> 16); 110 111 return (uint16_t)~sum; 112 }