llvm-capstone/libc/AOR_v20.02/networking/chksum.c
Kristof Beyls 0928368f62 [libc] Provide Arm Optimized Routines for the LLVM libc project.
This adds the Arm Optimized Routines (see
https://github.com/ARM-software/optimized-routines) source code under the
LLVM license. The version of the code provided in this patch is v20.02
of the Arm Optimized Routines project.

This entire contribution is being committed as-is even though it does
not currently fit the LLVM libc model and does not follow the LLVM
coding style. In the near future, implementations from this patch will be
moved to their proper places in the LLVM-libc tree. This will be done
over many small patches, all of which will go through the normal LLVM code
review process. See this libc-dev post for the plan:
http://lists.llvm.org/pipermail/libc-dev/2020-March/000044.html

Differential revision of the original upload: https://reviews.llvm.org/D75355
2020-03-16 12:19:31 -07:00

/*
* Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
* This sum is often used as a simple checksum in networking.
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "networking.h"
#include "chksum_common.h"
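
/*
 * Illustration only, not part of the original AOR source: a minimal sketch of
 * the end-around-carry reduction described in the header comment above. The
 * running sum is folded back into 16 bits by repeatedly adding the carries
 * above bit 15 into the low half. The helper name example_fold16 is
 * hypothetical and the block is compiled out; the reduction actually used
 * below is performed by fold_and_swap().
 */
#if 0
static uint16_t
example_fold16(uint64_t sum)
{
    /* Each fold preserves the value modulo 0xffff (end-around carry) */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t) sum;
}
#endif
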
always_inline
static inline uint32_t
slurp_head32(const void **pptr, uint32_t *nbytes)
{
    uint32_t sum = 0;
    Assert(*nbytes >= 4);
    uint32_t off = (uintptr_t) *pptr % 4;
    if (likely(off != 0))
    {
        /* Get rid of bytes 0..off-1 */
        const unsigned char *ptr32 = align_ptr(*pptr, 4);
        uint32_t mask = ~0U << (CHAR_BIT * off);
        sum = load32(ptr32) & mask;
        *pptr = ptr32 + 4;
        *nbytes -= 4 - off;
    }
    return sum;
}

/* Additional loop unrolling would help when not auto-vectorizing */
unsigned short
__chksum(const void *ptr, unsigned int nbytes)
{
    bool swap = false;
    uint64_t sum = 0;

    if (nbytes > 300)
    {
        /* 4-byte align pointer */
        swap = (uintptr_t) ptr & 1;
        sum = slurp_head32(&ptr, &nbytes);
    }
    /* Else benefit of aligning not worth the overhead */

    /* Sum all 16-byte chunks */
    const char *cptr = ptr;
    for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
    {
        uint64_t h0 = load32(cptr + 0);
        uint64_t h1 = load32(cptr + 4);
        uint64_t h2 = load32(cptr + 8);
        uint64_t h3 = load32(cptr + 12);
        sum += h0 + h1 + h2 + h3;
        cptr += 16;
    }
    nbytes %= 16;
    Assert(nbytes < 16);

    /* Handle any trailing 4-byte chunks */
    while (nbytes >= 4)
    {
        sum += load32(cptr);
        cptr += 4;
        nbytes -= 4;
    }
    Assert(nbytes < 4);

    /* Handle any trailing 2-byte chunk */
    if (nbytes & 2)
    {
        sum += load16(cptr);
        cptr += 2;
    }
    /* Handle any trailing byte */
    if (nbytes & 1)
    {
        sum += *(uint8_t *) cptr;
    }

    /* Reduce to a 16-bit sum, swapping bytes if the base address was odd */
    return fold_and_swap(sum, swap);
}
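
/*
 * Usage sketch, illustration only and not part of the original file: __chksum
 * returns the 16-bit ones'-complement sum of a buffer; a caller computing an
 * RFC 1071 style Internet checksum would typically take the complement of the
 * returned sum. The test driver below is hypothetical and compiled out.
 */
#if 0
#include <stdio.h>

int
main(void)
{
    /* Arbitrary sample buffer */
    unsigned char buf[] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
    unsigned short sum = __chksum(buf, sizeof buf);
    unsigned short cksum = (unsigned short) ~sum; /* RFC 1071 complement */
    printf("sum=0x%04x checksum=0x%04x\n", (unsigned) sum, (unsigned) cksum);
    return 0;
}
#endif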