Cradicle Explorer

ipchksum.c
  1  /* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
  2  
  3  #include <commonlib/bsd/helpers.h>
  4  #include <commonlib/bsd/ipchksum.h>
  5  
  6  /* See RFC 1071 for mathematical explanations of why we can first sum in a larger register and
  7     then narrow down, why we don't need to worry about endianness, etc. */
  8  uint16_t ipchksum(const void *data, size_t size)
  9  {
 10  	const uint8_t *p1 = data;
 11  	unsigned long wide_sum = 0;
 12  	uint32_t sum = 0;
 13  	size_t i = 0;
 14  
 15  #if defined(__aarch64__)
 16  	size_t size16 = size / 16;
 17  	const uint64_t *p8 = data;
 18  	if (size16) {
 19  		unsigned long tmp1, tmp2;
 20  		i = size16 * 16;
 21  		asm (
 22  			"adds	xzr, xzr, xzr\n\t"	/* init carry flag for addition */
 23  			"1:\n\t"
 24  			"ldp	%[v1], %[v2], [%[p8]], #16\n\t"
 25  			"adcs	%[wsum], %[wsum], %[v1]\n\t"
 26  			"adcs	%[wsum], %[wsum], %[v2]\n\t"
 27  			"sub	%[size16], %[size16], #1\n\t"
 28  			"cbnz	%[size16], 1b\n\t"
 29  			"adcs	%[wsum], %[wsum], xzr\n\t"	/* use up last carry */
 30  		: [v1] "=r" (tmp1),
 31  		  [v2] "=r" (tmp2),
 32  		  [wsum] "+r" (wide_sum),
 33  		  [p8] "+r" (p8),
 34  		  [size16] "+r" (size16)
 35  		:: "cc"
 36  		);
 37  	}
 38  #elif defined(__i386__) || defined(__x86_64__)	/* __aarch64__ */
 39  	size_t size8 = size / 8;
 40  	const uint64_t *p8 = data;
 41  	i = size8 * 8;
 42  	asm (
 43  		"clc\n\t"
 44  		"1:\n\t"
 45  		"jecxz	2f\n\t"		/* technically RCX on 64, but not gonna be that big */
 46  		"adc	(%[p8]), %[wsum]\n\t"
 47  #if defined(__i386__)
 48  		"adc	4(%[p8]), %[wsum]\n\t"
 49  #endif	/* __i386__ */
 50  		"lea	-1(%[size8]), %[size8]\n\t"	/* Use LEA as a makeshift ADD that */
 51  		"lea	8(%[p8]), %[p8]\n\t"		/* doesn't modify the carry flag. */
 52  		"jmp	1b\n\t"
 53  		"2:\n\t"
 54  		"setc	%b[size8]\n\t"	/* reuse size register to save last carry */
 55  		"add	%[size8], %[wsum]\n\t"
 56  	: [wsum] "+r" (wide_sum),
 57  	  [p8] "+r" (p8),
 58  	  [size8] "+c" (size8)		/* put size in ECX so we can JECXZ */
 59  	:: "cc"
 60  	);
 61  #else	/* __i386__ || __x86_64__ */
 62  	size_t aligned_size = ALIGN_DOWN(size, sizeof(unsigned long));
 63  	const unsigned long *p_long = data;
 64  	for (; i < aligned_size; i += sizeof(unsigned long)) {
 65  		unsigned long new_sum = wide_sum + *p_long++;
 66  		/* Overflow check to emulate a manual "add with carry" in C. The compiler seems
 67  		   to be clever enough to find ways to elide the branch on most archs. */
 68  		if (new_sum < wide_sum)
 69  			new_sum++;
 70  		wide_sum = new_sum;
 71  	}
 72  #endif
 73  
 74  	while (wide_sum) {
 75  		sum += wide_sum & 0xFFFF;
 76  		wide_sum >>= 16;
 77  	}
 78  	sum = (sum & 0xFFFF) + (sum >> 16);
 79  
 80  	for (; i < size; i++) {
 81  		uint32_t v = p1[i];
 82  		if (i % 2)
 83  			v <<= 8;
 84  		sum += v;
 85  
 86  		/* Doing this unconditionally seems to be faster. */
 87  		sum = (sum & 0xFFFF) + (sum >> 16);
 88  	}
 89  
 90  	return (uint16_t)~sum;
 91  }
 92  
 93  uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second)
 94  {
 95  	first = ~first;
 96  	second = ~second;
 97  
 98  	/*
 99  	 * Since the checksum is calculated in 16-bit chunks, if the offset at which
100  	 * the data covered by the second checksum would start (if both data streams
101  	 * came one after the other) is odd, that means the second stream starts in
102  	 * the middle of a 16-bit chunk. This means the second checksum is byte
103  	 * swapped compared to what we need it to be, and we must swap it back.
104  	 */
105  	if (offset % 2)
106  		second = (second >> 8) | (second << 8);
107  
108  	uint32_t sum = first + second;
109  	sum = (sum & 0xFFFF) + (sum >> 16);
110  
111  	return (uint16_t)~sum;
112  }