Mirror of https://github.com/shadps4-emu/ext-zlib-ng.git, last synced 2024-10-07 00:13:58 +00:00.
Added support for unaligned/intrinsic to deflate_rle.
This commit is contained in:
parent
d7ba81940e
commit
80ba8b6e19
@ -979,6 +979,7 @@ set(ZLIB_PUBLIC_HDRS
|
||||
set(ZLIB_PRIVATE_HDRS
|
||||
adler32_p.h
|
||||
chunkset_tpl.h
|
||||
compare256_rle.h
|
||||
cpu_features.h
|
||||
crc32_braid_p.h
|
||||
crc32_braid_comb_p.h
|
||||
|
134
compare256_rle.h
Normal file
134
compare256_rle.h
Normal file
@ -0,0 +1,134 @@
|
||||
/* compare256_rle.h -- 256 byte run-length encoding comparison
|
||||
* Copyright (C) 2022 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
|
||||
|
||||
/* ALIGNED, byte comparison */
|
||||
/* ALIGNED, byte comparison.
 * Compare up to 256 bytes of src1 against the single run byte *src0.
 * Returns the number of leading bytes of src1 equal to *src0 (max 256). */
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len;

    /* Plain byte-at-a-time scan; the compiler is free to unroll this. */
    for (len = 0; len < 256; len++) {
        if (src0[0] != src1[len])
            return len;
    }

    return 256;
}
|
||||
|
||||
#ifdef UNALIGNED_OK
|
||||
/* 16-bit unaligned integer comparison */
|
||||
/* 16-bit unaligned integer comparison.
 * src0 points at two consecutive copies of the run byte; src1 is scanned
 * two bytes at a time. On a mismatching pair, the single-byte check
 * (*src0 == *src1) adds one if only the second byte of the pair differed.
 * Returns the match length, capped at 256. */
static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
    uint16_t pattern, chunk;
    uint32_t len;

    memcpy(&pattern, src0, sizeof(pattern));

    for (len = 0; len < 256; len += 2, src1 += 2) {
        memcpy(&chunk, src1, sizeof(chunk));
        if (pattern != chunk)
            return len + (*src0 == *src1);
    }

    return 256;
}
|
||||
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
/* 32-bit unaligned integer comparison */
|
||||
/* 32-bit unaligned integer comparison.
 * Broadcasts the two-byte run pattern at src0 into a 32-bit word and scans
 * src1 four bytes at a time; __builtin_ctz on the xor locates the first
 * differing byte (NOTE: byte index from ctz assumes a little-endian target,
 * matching the UNALIGNED_OK platforms this path is compiled for).
 * Returns the match length, capped at 256. */
static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
    uint16_t pair;
    uint32_t pattern, len;

    memcpy(&pair, src0, sizeof(pair));
    pattern = ((uint32_t)pair << 16) | pair;

    for (len = 0; len < 256; len += 4, src1 += 4) {
        uint32_t chunk, diff;

        memcpy(&chunk, src1, sizeof(chunk));
        diff = pattern ^ chunk;
        if (diff != 0)
            return len + (uint32_t)(__builtin_ctz(diff) / 8);
    }

    return 256;
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
/* 64-bit unaligned integer comparison */
|
||||
/* 64-bit unaligned integer comparison.
 * Broadcasts the two-byte run pattern at src0 into a 64-bit word and scans
 * src1 eight bytes at a time; __builtin_ctzll on the xor locates the first
 * differing byte (NOTE: byte index from ctzll assumes a little-endian
 * target, matching the UNALIGNED64_OK platforms this path compiles for).
 * Returns the match length, capped at 256. */
static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
    uint16_t pair;
    uint32_t pattern32, len;
    uint64_t pattern;

    memcpy(&pair, src0, sizeof(pair));
    pattern32 = ((uint32_t)pair << 16) | pair;
    pattern = ((uint64_t)pattern32 << 32) | pattern32;

    for (len = 0; len < 256; len += 8, src1 += 8) {
        uint64_t chunk, diff;

        memcpy(&chunk, src1, sizeof(chunk));
        diff = pattern ^ chunk;
        if (diff != 0)
            return len + (uint32_t)(__builtin_ctzll(diff) / 8);
    }

    return 256;
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -5,10 +5,23 @@
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "compare256_rle.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
#ifdef UNALIGNED_OK
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
# define compare256_rle compare256_rle_unaligned_64
|
||||
# elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define compare256_rle compare256_rle_unaligned_32
|
||||
# else
|
||||
# define compare256_rle compare256_rle_unaligned_16
|
||||
# endif
|
||||
#else
|
||||
# define compare256_rle compare256_rle_c
|
||||
#endif
|
||||
|
||||
/* ===========================================================================
|
||||
* For Z_RLE, simply look for runs of bytes, generate matches only of distance
|
||||
* one. Do not maintain a hash table. (It will be regenerated if this run of
|
||||
@ -16,8 +29,7 @@
|
||||
*/
|
||||
Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
|
||||
int bflush = 0; /* set if current block must be flushed */
|
||||
unsigned int prev; /* byte at distance one to match */
|
||||
unsigned char *scan, *strend; /* scan goes up to strend for length of run */
|
||||
unsigned char *scan; /* scan goes up to strend for length of run */
|
||||
uint32_t match_len = 0;
|
||||
|
||||
for (;;) {
|
||||
@ -36,20 +48,12 @@ Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
|
||||
/* See how many times the previous byte repeats */
|
||||
if (s->lookahead >= STD_MIN_MATCH && s->strstart > 0) {
|
||||
scan = s->window + s->strstart - 1;
|
||||
prev = *scan;
|
||||
if (prev == *++scan && prev == *++scan && prev == *++scan) {
|
||||
strend = s->window + s->strstart + STD_MAX_MATCH;
|
||||
do {
|
||||
} while (prev == *++scan && prev == *++scan &&
|
||||
prev == *++scan && prev == *++scan &&
|
||||
prev == *++scan && prev == *++scan &&
|
||||
prev == *++scan && prev == *++scan &&
|
||||
scan < strend);
|
||||
match_len = STD_MAX_MATCH - (unsigned int)(strend - scan);
|
||||
if (scan[0] == scan[1] && scan[1] == scan[2]) {
|
||||
match_len = compare256_rle(scan, scan+3)+2;
|
||||
match_len = MIN(match_len, s->lookahead);
|
||||
match_len = MIN(match_len, STD_MAX_MATCH);
|
||||
}
|
||||
Assert(scan <= s->window + s->window_size - 1, "wild scan");
|
||||
Assert(scan+match_len <= s->window + s->window_size - 1, "wild scan");
|
||||
}
|
||||
|
||||
/* Emit match if have run of STD_MIN_MATCH or longer, else emit literal */
|
||||
|
@ -5,6 +5,7 @@ include(FetchContent)
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS ON)
|
||||
|
||||
enable_language(CXX)
|
||||
|
||||
# Search for Google benchmark package
|
||||
@ -36,6 +37,7 @@ add_executable(benchmark_zlib
|
||||
benchmark_adler32.cc
|
||||
benchmark_adler32_copy.cc
|
||||
benchmark_compare256.cc
|
||||
benchmark_compare256_rle.cc
|
||||
benchmark_crc32.cc
|
||||
benchmark_main.cc
|
||||
benchmark_slidehash.cc
|
||||
|
Loading…
Reference in New Issue
Block a user