This adds the Arm Optimized Routines (see https://github.com/ARM-software/optimized-routines) source code under the LLVM license. The version of the code provided in this patch is v20.02 of the Arm Optimized Routines project. This entire contribution is being committed as is even though it does not currently fit the LLVM libc model and does not follow the LLVM coding style. In the near future, implementations from this patch will be moved over to their right place in the LLVM-libc tree. This will be done over many small patches, all of which will go through the normal LLVM code review process. See this libc-dev post for the plan: http://lists.llvm.org/pipermail/libc-dev/2020-March/000044.html Differential revision of the original upload: https://reviews.llvm.org/D75355
83 lines
1.8 KiB
C
/*
 * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
 * This sum is often used as a simple checksum in networking.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */
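
/*
 * Illustrative note (not from the original source): in ones' complement
 * arithmetic, any carry out of bit 15 is added back into bit 0 ("end-around
 * carry").  For example, 0xf000 + 0xf000 = 0x1e000, which folds to
 * 0xe000 + 1 = 0xe001.  The code below accumulates 32-bit loads into a
 * 64-bit sum and defers all folding to the final fold_and_swap() call
 * (defined in chksum_common.h, not shown here); this is equivalent because
 * ones' complement addition is associative and commutative.
 */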
#include "networking.h"
|
|
#include "chksum_common.h"
|
|
|
|
always_inline
static inline uint32_t
slurp_head32(const void **pptr, uint32_t *nbytes)
{
    uint32_t sum = 0;
    Assert(*nbytes >= 4);
    uint32_t off = (uintptr_t) *pptr % 4;
    if (likely(off != 0))
    {
        /* Get rid of bytes 0..off-1 */
        const unsigned char *ptr32 = align_ptr(*pptr, 4);
        uint32_t mask = ~0U << (CHAR_BIT * off);
        sum = load32(ptr32) & mask;
        *pptr = ptr32 + 4;
        *nbytes -= 4 - off;
    }
    return sum;
}
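
/*
 * Illustrative note (assumes a little-endian target, as on the Arm cores this
 * code targets): if *pptr is 1 byte past a 4-byte boundary, then off == 1 and
 * mask == 0xffffff00, so the low byte of the aligned 32-bit load -- the byte
 * that precedes the caller's buffer -- is cleared before being added to the
 * sum.  The caller's pointer is advanced to the next aligned word and nbytes
 * shrinks by the 3 bytes actually consumed.
 */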

/* Additional loop unrolling would help when not auto-vectorizing */
unsigned short
__chksum(const void *ptr, unsigned int nbytes)
{
    bool swap = false;
    uint64_t sum = 0;

    if (nbytes > 300)
    {
        /* 4-byte align pointer */
        swap = (uintptr_t) ptr & 1;
        sum = slurp_head32(&ptr, &nbytes);
    }
    /* Else benefit of aligning not worth the overhead */
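    /*
     * Illustrative note: when ptr is odd, the aligned summation pairs each
     * byte with the opposite half of its 16-bit word.  Per RFC 1071 the ones'
     * complement sum is unaffected by this except for a final byte swap,
     * which is presumably what the swap flag asks fold_and_swap() to perform.
     */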

    /* Sum all 16-byte chunks */
    const char *cptr = ptr;
    for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
    {
        uint64_t h0 = load32(cptr + 0);
        uint64_t h1 = load32(cptr + 4);
        uint64_t h2 = load32(cptr + 8);
        uint64_t h3 = load32(cptr + 12);
        sum += h0 + h1 + h2 + h3;
        cptr += 16;
    }
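    /*
     * Illustrative note: with a 32-bit nbytes this loop runs fewer than 2^28
     * times and each iteration adds less than 2^34, so the running total
     * stays below 2^62 and cannot overflow the 64-bit accumulator.
     */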
    nbytes %= 16;
    Assert(nbytes < 16);

    /* Handle any trailing 4-byte chunks */
    while (nbytes >= 4)
    {
        sum += load32(cptr);
        cptr += 4;
        nbytes -= 4;
    }
    Assert(nbytes < 4);

    if (nbytes & 2)
    {
        sum += load16(cptr);
        cptr += 2;
    }

    if (nbytes & 1)
    {
        sum += *(uint8_t *)cptr;
    }

    return fold_and_swap(sum, swap);
}
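
/*
 * Minimal usage sketch (not part of the original source, compiled out): it
 * simply feeds an arbitrary byte buffer to __chksum() and prints the folded
 * 16-bit result.  Whether a caller building an Internet-style checksum field
 * still needs to take the ones' complement (~) of this value depends on
 * fold_and_swap() in chksum_common.h, which is not shown here.
 */
#if 0
#include <stdio.h>

int main(void)
{
    unsigned char buf[11] = {0x45, 0x00, 0x00, 0x73, 0x00, 0x00,
                             0x40, 0x00, 0x40, 0x11, 0x00};
    unsigned short sum = __chksum(buf, sizeof buf);

    printf("ones' complement sum = 0x%04x\n", sum);
    return 0;
}
#endif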