Added support for unaligned/intrinsic to deflate_rle.

Authored by Nathan Moinvaziri on 2023-02-05 10:30:18 -08:00; committed by Hans Kristian Rosbach
parent d7ba81940e
commit 80ba8b6e19
4 changed files with 154 additions and 13 deletions

CMakeLists.txt

@@ -979,6 +979,7 @@ set(ZLIB_PUBLIC_HDRS
 set(ZLIB_PRIVATE_HDRS
     adler32_p.h
     chunkset_tpl.h
+    compare256_rle.h
     cpu_features.h
     crc32_braid_p.h
     crc32_braid_comb_p.h

compare256_rle.h (new file, 134 lines)

@@ -0,0 +1,134 @@
/* compare256_rle.h -- 256 byte run-length encoding comparison
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "fallback_builtins.h"
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
/* ALIGNED, byte comparison */
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
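    /* Unrolled 8x: each loop iteration checks eight successive src1 bytes against the fixed run byte at *src0 */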
do {
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
} while (len < 256);
return 256;
}

#ifdef UNALIGNED_OK
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
uint16_t src0_cmp, src1_cmp;
memcpy(&src0_cmp, src0, sizeof(src0_cmp));
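    /* deflate_rle guarantees src0[0] == src0[1], so src0_cmp is the run's repeating 16-bit pattern */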
do {
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
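        /* If the 16-bit words differ, the first byte may still match; (*src0 == *src1) adds that last byte */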
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
} while (len < 256);
return 256;
}

#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
uint32_t sv, len = 0;
uint16_t src0_cmp;
memcpy(&src0_cmp, src0, sizeof(src0_cmp));
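    /* Broadcast the repeating 16-bit pattern into all four bytes of a 32-bit comparison value */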
sv = ((uint32_t)src0_cmp << 16) | src0_cmp;
do {
uint32_t mv, diff;
memcpy(&mv, src1, sizeof(mv));
diff = sv ^ mv;
if (diff) {
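            /* Count trailing zero bits to find the first mismatching byte (assumes little-endian byte order) */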
uint32_t match_byte = __builtin_ctz(diff) / 8;
return len + match_byte;
}
src1 += 4, len += 4;
} while (len < 256);
return 256;
}
#endif

#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
/* 64-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
uint32_t src0_cmp32, len = 0;
uint16_t src0_cmp;
uint64_t sv;
memcpy(&src0_cmp, src0, sizeof(src0_cmp));
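    /* Broadcast the 16-bit pattern to 32 bits, then to a full 64-bit comparison value */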
src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp;
sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32;
do {
uint64_t mv, diff;
memcpy(&mv, src1, sizeof(mv));
diff = sv ^ mv;
if (diff) {
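            /* Same trailing-zero trick as the 32-bit variant, eight bytes per iteration */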
uint64_t match_byte = __builtin_ctzll(diff) / 8;
return len + (uint32_t)match_byte;
}
src1 += 8, len += 8;
} while (len < 256);
return 256;
}
#endif
#endif
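
All variants must agree on the match length for the same input. A minimal sanity check, hypothetical and not part of this commit, assuming a little-endian build with this header and its prerequisites on the include path:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "compare256_rle.h"

int main(void) {
    uint8_t buf[512];
    memset(buf, 'a', sizeof(buf));
    buf[103] = 'b';                     /* end the run: 100 matching bytes after src1 */

    /* Mirror the deflate_rle call shape: compare256_rle(scan, scan+3) */
    uint32_t len = compare256_rle_c(buf, buf + 3);
    assert(len == 100);
#ifdef UNALIGNED_OK
    assert(compare256_rle_unaligned_16(buf, buf + 3) == len);
#  ifdef HAVE_BUILTIN_CTZ
    assert(compare256_rle_unaligned_32(buf, buf + 3) == len);
#  endif
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    assert(compare256_rle_unaligned_64(buf, buf + 3) == len);
#  endif
#endif
    printf("run length after scan+3: %u\n", len);
    return 0;
}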

deflate_rle.c

@@ -5,10 +5,23 @@
  */

 #include "zbuild.h"
+#include "compare256_rle.h"
 #include "deflate.h"
 #include "deflate_p.h"
 #include "functable.h"

+#ifdef UNALIGNED_OK
+#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+#    define compare256_rle compare256_rle_unaligned_64
+#  elif defined(HAVE_BUILTIN_CTZ)
+#    define compare256_rle compare256_rle_unaligned_32
+#  else
+#    define compare256_rle compare256_rle_unaligned_16
+#  endif
+#else
+#  define compare256_rle compare256_rle_c
+#endif
+
 /* ===========================================================================
  * For Z_RLE, simply look for runs of bytes, generate matches only of distance
  * one.  Do not maintain a hash table.  (It will be regenerated if this run of
@@ -16,8 +29,7 @@
  */
 Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
     int bflush = 0;                 /* set if current block must be flushed */
-    unsigned int prev;              /* byte at distance one to match */
-    unsigned char *scan, *strend;   /* scan goes up to strend for length of run */
+    unsigned char *scan;            /* scan goes up to strend for length of run */
     uint32_t match_len = 0;

     for (;;) {
@@ -36,20 +48,12 @@ Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
         /* See how many times the previous byte repeats */
         if (s->lookahead >= STD_MIN_MATCH && s->strstart > 0) {
             scan = s->window + s->strstart - 1;
-            prev = *scan;
-            if (prev == *++scan && prev == *++scan && prev == *++scan) {
-                strend = s->window + s->strstart + STD_MAX_MATCH;
-                do {
-                } while (prev == *++scan && prev == *++scan &&
-                         prev == *++scan && prev == *++scan &&
-                         prev == *++scan && prev == *++scan &&
-                         prev == *++scan && prev == *++scan &&
-                         scan < strend);
-                match_len = STD_MAX_MATCH - (unsigned int)(strend - scan);
-                if (match_len > s->lookahead)
-                    match_len = s->lookahead;
-            }
-            Assert(scan <= s->window + s->window_size - 1, "wild scan");
+            if (scan[0] == scan[1] && scan[1] == scan[2]) {
+                match_len = compare256_rle(scan, scan+3)+2;
+                match_len = MIN(match_len, s->lookahead);
+                match_len = MIN(match_len, STD_MAX_MATCH);
+            }
+            Assert(scan+match_len <= s->window + s->window_size - 1, "wild scan");
         }

         /* Emit match if have run of STD_MIN_MATCH or longer, else emit literal */

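The effect of the rewrite is easier to see in isolation. The sketch below is a simplified stand-alone model of the new detection logic, not the library code itself: compare256_rle is replaced by a plain byte loop and the deflate_state bookkeeping is reduced to a lookahead argument.

#include <stdint.h>
#include <stdio.h>

#define STD_MAX_MATCH 258
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Stand-in for compare256_rle: byte-wise, capped at 256 */
static uint32_t compare256_rle_model(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    while (len < 256 && src1[len] == *src0)
        len++;
    return len;
}

/* Model of the new detection: scan points one byte before strstart */
static uint32_t rle_match_len(const uint8_t *scan, uint32_t lookahead) {
    uint32_t match_len = 0;
    if (scan[0] == scan[1] && scan[1] == scan[2]) {
        match_len = compare256_rle_model(scan, scan + 3) + 2;
        match_len = MIN(match_len, lookahead);
        match_len = MIN(match_len, STD_MAX_MATCH);
    }
    return match_len;
}

int main(void) {
    uint8_t window[300] = {0};
    for (int i = 0; i < 10; i++)
        window[i] = 'x';                      /* a 10-byte run at the window start */
    printf("%u\n", rle_match_len(window, 9)); /* prints 9: run capped by lookahead */
    return 0;
}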
CMakeLists.txt (benchmark suite)

@@ -5,6 +5,7 @@ include(FetchContent)
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)

 enable_language(CXX)

 # Search for Google benchmark package
@@ -36,6 +37,7 @@ add_executable(benchmark_zlib
     benchmark_adler32.cc
     benchmark_adler32_copy.cc
     benchmark_compare256.cc
+    benchmark_compare256_rle.cc
     benchmark_crc32.cc
     benchmark_main.cc
     benchmark_slidehash.cc
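
The added benchmark_compare256_rle.cc (not reproduced in this view) drives these functions through Google Benchmark. As a rough stand-in for what it measures, a plain C timing loop over the scalar variant might look like this, assuming the zlib-ng include paths are available:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "compare256_rle.h"   /* assumes zbuild.h and friends are on the include path */

int main(void) {
    uint8_t buf[512];
    memset(buf, 'a', sizeof(buf));   /* one long run, so every call walks the full 256 bytes */

    uint32_t sink = 0;               /* accumulate results so the loop is not optimized away */
    clock_t start = clock();
    for (int i = 0; i < 1000000; i++)
        sink += compare256_rle_c(buf, buf + 3);
    double secs = (double)(clock() - start) / CLOCKS_PER_SEC;

    printf("compare256_rle_c: %.3fs (checksum %u)\n", secs, sink);
    return 0;
}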