Update ext/jpge

From aeb7d3b463

Fixes #14012

Also now comes with SSE2 SIMD
This commit is contained in:
Gleb Mazovetskiy 2021-01-30 11:39:01 +00:00
parent c251d69eab
commit 5a09bc3d30
7 changed files with 5097 additions and 4347 deletions

25
ext/jpge/README.md Normal file
View File

@ -0,0 +1,25 @@
# jpeg-compressor
This directory contains code from
https://github.com/richgel999/jpeg-compressor/tree/aeb7d3b463aa8228b87a28013c15ee50a7e6fcf3
with the following patches:
* https://github.com/richgel999/jpeg-compressor/pull/18
## License
The license of jpgd.cpp/.h and jpge.cpp/.h is either Public Domain or Apache 2.0. Choose whatever you want.
The license for the optional file jpgd_idct.h (and ONLY this file) is Copyright 2009 Intel Corporation:
Permission is granted to use, copy, distribute and prepare derivative works of
this software for any purpose and without fee, provided, that the above
copyright notice and this statement appear in all copies. Intel makes no
representations about the suitability of this software for any purpose. THIS
SOFTWARE IS PROVIDED "AS IS." INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES,
EXPRESS OR IMPLIED, AND ALL LIABILITY, INCLUDING CONSEQUENTIAL AND OTHER
INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, INCLUDING LIABILITY FOR
INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not assume any
responsibility for any errors which may appear in this software nor any
responsibility to update it.

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +1,14 @@
// jpgd.h - C++ class for JPEG decompression.
// Public domain, Rich Geldreich <richgel99@gmail.com>
// Richard Geldreich <richgel99@gmail.com>
// See jpgd.cpp for license (Public Domain or Apache 2.0).
#ifndef JPEG_DECODER_H
#define JPEG_DECODER_H
#include <stdlib.h>
#include <stdio.h>
#include <setjmp.h>
#include <assert.h>
#include <stdint.h>
#ifdef _MSC_VER
#define JPGD_NORETURN __declspec(noreturn)
@ -15,6 +18,9 @@
#define JPGD_NORETURN
#endif
#define JPGD_HUFF_TREE_MAX_LENGTH 512
#define JPGD_HUFF_CODE_SIZE_MAX_LENGTH 256
namespace jpgd
{
typedef unsigned char uint8;
@ -28,8 +34,8 @@ namespace jpgd
// On return, width/height will be set to the image's dimensions, and actual_comps will be set to either 1 (grayscale) or 3 (RGB).
// Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly.
// Requesting an 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps);
unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps);
unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
unsigned char* decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
// Success/failure error codes.
enum jpgd_status
@ -41,8 +47,8 @@ namespace jpgd
JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER, JPGD_ASSERTION_ERROR,
JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM
JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER,
JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM, JPGD_TOO_MANY_SCANS
};
// Input stream interface.
@ -104,28 +110,35 @@ namespace jpgd
};
// Loads JPEG file from a jpeg_decoder_stream.
unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps);
unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
enum
{
JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 8192, JPGD_MAX_HEIGHT = 16384, JPGD_MAX_WIDTH = 16384
JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 16384, JPGD_MAX_HEIGHT = 32768, JPGD_MAX_WIDTH = 32768
};
typedef int16 jpgd_quant_t;
typedef int16 jpgd_block_t;
typedef int16 jpgd_block_coeff_t;
class jpeg_decoder
{
public:
enum
{
cFlagBoxChromaFiltering = 1,
cFlagDisableSIMD = 2
};
// Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
// methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
jpeg_decoder(jpeg_decoder_stream *pStream);
jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags = 0);
~jpeg_decoder();
// Call this method after constructing the object to begin decompression.
// If JPGD_SUCCESS is returned you may then call decode() on each scanline.
int begin_decoding();
// Returns the next scan line.
@ -160,8 +173,8 @@ namespace jpgd
bool ac_table;
uint look_up[256];
uint look_up2[256];
uint8 code_size[256];
uint tree[512];
uint8 code_size[JPGD_HUFF_CODE_SIZE_MAX_LENGTH];
uint tree[JPGD_HUFF_TREE_MAX_LENGTH];
};
struct coeff_buf
@ -181,11 +194,14 @@ namespace jpgd
};
jmp_buf m_jmp_state;
uint32_t m_flags;
mem_block* m_pMem_blocks;
int m_image_x_size;
int m_image_y_size;
jpeg_decoder_stream* m_pStream;
int m_progressive_flag;
uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES];
uint8* m_huff_num[JPGD_MAX_HUFF_TABLES]; // pointer to number of Huffman codes per bit size
uint8* m_huff_val[JPGD_MAX_HUFF_TABLES]; // pointer to Huffman codes per bit size
@ -214,6 +230,7 @@ namespace jpgd
int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
int m_total_lines_left; // total # lines left in image
int m_mcu_lines_left; // total # lines left in this MCU
int m_num_buffered_scanlines;
int m_real_dest_bytes_per_scan_line;
int m_dest_bytes_per_scan_line; // rounded up
int m_dest_bytes_per_pixel; // 4 (RGB) or 1 (Y)
@ -225,10 +242,11 @@ namespace jpgd
uint8* m_pIn_buf_ofs;
int m_in_buf_left;
int m_tem_flag;
bool m_eof_flag;
uint8 m_in_buf_pad_start[128];
uint8 m_in_buf_pad_start[64];
uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128];
uint8 m_in_buf_pad_end[128];
uint8 m_in_buf_pad_end[64];
int m_bits_left;
uint m_bit_buf;
int m_restart_interval;
@ -236,15 +254,13 @@ namespace jpgd
int m_next_restart_num;
int m_max_mcus_per_row;
int m_max_blocks_per_mcu;
int m_expanded_blocks_per_mcu;
int m_expanded_blocks_per_row;
int m_expanded_blocks_per_component;
bool m_freq_domain_chroma_upsample;
int m_max_mcus_per_col;
uint m_last_dc_val[JPGD_MAX_COMPONENTS];
jpgd_block_t* m_pMCU_coefficients;
jpgd_block_coeff_t* m_pMCU_coefficients;
int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
uint8* m_pSample_buf;
uint8* m_pSample_buf_prev;
int m_crr[256];
int m_cbb[256];
int m_crg[256];
@ -252,12 +268,18 @@ namespace jpgd
uint8* m_pScan_line_0;
uint8* m_pScan_line_1;
jpgd_status m_error_code;
bool m_ready_flag;
int m_total_bytes_read;
bool m_ready_flag;
bool m_eof_flag;
bool m_sample_buf_prev_valid;
bool m_has_sse2;
// Debug bounds check for an offset: asserts 0 <= ofs < m_max_blocks_per_row * 64 and returns ofs unchanged.
// The checks are plain assert() calls, so they vanish when NDEBUG is defined.
inline int check_sample_buf_ofs(int ofs) const { assert(ofs >= 0); assert(ofs < m_max_blocks_per_row * 64); return ofs; }
void free_all_blocks();
JPGD_NORETURN void stop_decoding(jpgd_status status);
void* alloc(size_t n, bool zero = false);
void* alloc_aligned(size_t nSize, uint32_t align = 16, bool zero = false);
void word_clear(void* p, uint16 c, uint n);
void prep_in_buffer();
void read_dht_marker();
@ -271,19 +293,18 @@ namespace jpgd
void locate_soi_marker();
void locate_sof_marker();
int locate_sos_marker();
void init(jpeg_decoder_stream * pStream);
void init(jpeg_decoder_stream* pStream, uint32_t flags);
void create_look_ups();
void fix_in_buffer();
void transform_mcu(int mcu_row);
void transform_mcu_expand(int mcu_row);
coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
inline jpgd_block_t *coeff_buf_getp(coeff_buf *cb, int block_x, int block_y);
inline jpgd_block_coeff_t* coeff_buf_getp(coeff_buf* cb, int block_x, int block_y);
void load_next_row();
void decode_next_row();
void make_huff_table(int index, huff_tables* pH);
void check_quant_tables();
void check_huff_tables();
void calc_mcu_block_order();
bool calc_mcu_block_order();
int init_scan();
void init_frame();
void process_restart();
@ -291,13 +312,15 @@ namespace jpgd
void init_progressive();
void init_sequential();
void decode_start();
void decode_init(jpeg_decoder_stream * pStream);
void decode_init(jpeg_decoder_stream* pStream, uint32_t flags);
void H2V2Convert();
uint32_t H2V2ConvertFiltered();
void H2V1Convert();
void H2V1ConvertFiltered();
void H1V2Convert();
void H1V2ConvertFiltered();
void H1V1Convert();
void gray_convert();
void expanded_convert();
void find_eoi();
inline uint get_char();
inline uint get_char(bool* pPadding_flag);
@ -307,7 +330,16 @@ namespace jpgd
inline uint get_bits_no_markers(int numbits);
inline int huff_decode(huff_tables* pH);
inline int huff_decode(huff_tables* pH, int& extrabits);
static inline uint8 clamp(int i);
// Saturates i to the range [0, 255] and returns it as a byte.
static inline uint8 clamp(int i)
{
// A single unsigned compare accepts exactly the values 0..255.
if (static_cast<uint>(i) <= 255)
return static_cast<uint8>(i);
// Out of range: ~i is non-negative for negative i (shifts down to 0) and
// negative for i > 255 (shifts down to all ones, masked to 0xFF).
const int saturated = ((~i) >> 31) & 0xFF;
return static_cast<uint8>(saturated);
}
int decode_next_mcu_row();
static void decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);
static void decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y);
static void decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);

462
ext/jpge/jpgd_idct.h Normal file
View File

@ -0,0 +1,462 @@
// Copyright 2009 Intel Corporation
// All Rights Reserved
//
// Permission is granted to use, copy, distribute and prepare derivative works of this
// software for any purpose and without fee, provided, that the above copyright notice
// and this statement appear in all copies. Intel makes no representations about the
// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS."
// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not
// assume any responsibility for any errors which may appear in this software nor any
// responsibility to update it.
//
// From:
// https://software.intel.com/sites/default/files/m/d/4/1/d/8/UsingIntelAVXToImplementIDCT-r1_5.pdf
// https://software.intel.com/file/29048
//
// Requires SSE2
//
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <immintrin.h>
#ifdef _MSC_VER
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif
// Fixed-point precision parameter from which the shift counts and rounding constants below are derived.
#define BITS_INV_ACC 4
// The shift counts are parenthesized so they expand as a single value inside any
// larger expression (e.g. 2 * SHIFT_INV_COL or 32 - SHIFT_INV_COL).
// SHIFT_INV_ROW evaluates to 12, SHIFT_INV_COL to 5.
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
// Rounding terms. With BITS_INV_ACC == 4 these evaluate to 2048, 16 and 15.
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); //1 << (SHIFT_INV_ROW-1)
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3); // 1 << (SHIFT_INV_COL-1)
const short IRND_INV_CORR = IRND_INV_COL - 1; // correction -1.0 and round
// 16-byte aligned vectors of eight 16-bit lanes, loaded as __m128i by idctSSEShortU8.
JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = {1, 1, 1, 1, 1, 1, 1, 1};
// Row rounding constant occupies the low 16 bits of each 32-bit lane (it is added with 32-bit adds).
JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) = {IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) = {IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8])= {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
// Fixed-point constants scaled by 1<<16 for use with _mm_mulhi_epi16.
// Values of 0.5 or more do not fit a signed 16-bit lane, so tg_3_16 and cos_4_16 are stored
// minus 1.0 (i.e. minus 65536); idctSSEShortU8 adds the multiplicand back after the multiply.
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tan(pi/16) * (1<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tan(2*pi/16) * (1<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // (tan(3*pi/16) - 1) * (1<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};// (cos(pi/4) - 1) * (1<<16) + 0.5
//-----------------------------------------------------------------------------
// Row-pass coefficient tables. Each table holds 32 shorts and is read by
// idctSSEShortU8 as four 128-bit groups at element offsets 0, 8, 16 and 24,
// each group being one operand of _mm_madd_epi16. The wNN labels name the
// coefficients in each group of four.
//
// Table for rows 0,4 - constants are multiplied by cos_4_16
// w15 w14 w11 w10 w07 w06 w03 w02
// w29 w28 w25 w24 w21 w20 w17 w16
// w31 w30 w27 w26 w23 w22 w19 w18
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
16384, 21407, 16384, 8867,
16384, -8867, 16384, -21407, // w13 w12 w09 w08
16384, 8867, -16384, -21407, // w07 w06 w03 w02
-16384, 21407, 16384, -8867, // w15 w14 w11 w10
22725, 19266, 19266, -4520, // w21 w20 w17 w16
12873, -22725, 4520, -12873, // w29 w28 w25 w24
12873, 4520, -22725, -12873, // w23 w22 w19 w18
4520, 19266, 19266, -22725}; // w31 w30 w27 w26
// Table for rows 1,7 - constants are multiplied by cos_1_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
22725, 29692, 22725, 12299,
22725, -12299, 22725, -29692, // w13 w12 w09 w08
22725, 12299, -22725, -29692, // w07 w06 w03 w02
-22725, 29692, 22725, -12299, // w15 w14 w11 w10
31521, 26722, 26722, -6270, // w21 w20 w17 w16
17855, -31521, 6270, -17855, // w29 w28 w25 w24
17855, 6270, -31521, -17855, // w23 w22 w19 w18
6270, 26722, 26722, -31521}; // w31 w30 w27 w26
// Table for rows 2,6 - constants are multiplied by cos_2_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
21407, 27969, 21407, 11585,
21407, -11585, 21407, -27969, // w13 w12 w09 w08
21407, 11585, -21407, -27969, // w07 w06 w03 w02
-21407, 27969, 21407, -11585, // w15 w14 w11 w10
29692, 25172, 25172, -5906, // w21 w20 w17 w16
16819, -29692, 5906, -16819, // w29 w28 w25 w24
16819, 5906, -29692, -16819, // w23 w22 w19 w18
5906, 25172, 25172, -29692}; // w31 w30 w27 w26
// Table for rows 3,5 - constants are multiplied by cos_3_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
19266, 25172, 19266, 10426,
19266, -10426, 19266, -25172, // w13 w12 w09 w08
19266, 10426, -19266, -25172, // w07 w06 w03 w02
-19266, 25172, 19266, -10426, // w15 w14 w11 w10
26722, 22654, 22654, -5315, // w21 w20 w17 w16
15137, -26722, 5315, -15137, // w29 w28 w25 w24
15137, 5315, -26722, -15137, // w23 w22 w19 w18
5315, 22654, 22654, -26722}; // w31 w30 w27 w26
// 128 in every lane: added to each result row by idctSSEShortU8 before packing to unsigned bytes.
JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 };
// SSE2 8x8 inverse DCT producing unsigned 8-bit output.
//
// pInput:    64 shorts read as eight rows of eight via _mm_load_si128, so the
//            buffer must be 16-byte aligned.
// pOutputUB: receives 64 bytes, row-major (row 0 first, 8 bytes per row). It is
//            written through __m128i pointers, so it must be 16-byte aligned too.
//
// Each input row is first transformed with the shortM128_tab_i_* tables (32-bit
// multiply-accumulate, rounding, arithmetic shift right by 12, pack to 16-bit).
// A second pass then combines the eight row results using the tg/cos constants,
// the *_inv_col rounding terms and SHIFT_INV_COL. Finally 128 is added to every
// value and the results are saturated to 0..255.
void idctSSEShortU8(const short *pInput, uint8_t * pOutputUB)
{
__m128i r_xmm0, r_xmm4;
__m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
short * pTab_i_04 = shortM128_tab_i_04;
short * pTab_i_26 = shortM128_tab_i_26;
// Select the coefficient tables for the first four rows handled below
// (these two assignments repeat the initializers above).
pTab_i_04 = shortM128_tab_i_04;
pTab_i_26 = shortM128_tab_i_26;
// Row 1 and row 3 (1-based numbering): loads input rows 0 and 2
r_xmm0 = _mm_load_si128((__m128i *) pInput);
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[2*8]));
// *** Work on the data in xmm0
//low shuffle mask = 0xd8 = 11 01 10 00
//get short 2 and short 0 into ls 32-bits
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
// copy short 2 and short 0 to all locations
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
// multiply-accumulate those copies against table entries 0..7
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
// shuffle mask = 0x55 = 01 01 01 01
// copy short 3 and short 1 to all locations
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
// high shuffle mask = 0xd8 = 11 01 10 00
// get short 6 and short 4 into bit positions 64-95
// get short 7 and short 5 into bit positions 96-127
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
// multiply-accumulate short 3 and short 1 against table entries 16..23
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
// shuffle mask = 0xaa = 10 10 10 10
// copy short 6 and short 4 to all locations
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
// shuffle mask = 0xff = 11 11 11 11
// copy short 7 and short 5 to all locations
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
// multiply-accumulate short 6 and short 4 against table entries 8..15
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
// *** Work on the data in xmm4
// high shuffle mask = 0xd8 11 01 10 00
// get short 6 and short 4 into bit positions 64-95
// get short 7 and short 5 into bit positions 96-127
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
// add the row rounding constant to the short 2 / short 0 products
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
// 0x1b = 00 01 10 11 reverses the four 32-bit lanes of the difference half
r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
row0 = _mm_packs_epi32(r_xmm0, r_xmm2);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
row2 = _mm_packs_epi32(r_xmm4, r_xmm6);
// Row 5 and row 7 (1-based numbering): loads input rows 4 and 6, same tables as above
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[4*8]));
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[6*8]));
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
row4 = _mm_packs_epi32(r_xmm0, r_xmm2);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
row6 = _mm_packs_epi32(r_xmm4, r_xmm6);
// Row 4 and row 2 (1-based numbering): loads input rows 3 and 1.
// The table pointers are re-aimed at the rows 3,5 and rows 1,7 tables for the remaining rows.
pTab_i_04 = shortM128_tab_i_35;
pTab_i_26 = shortM128_tab_i_17;
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[3*8]));
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[1*8]));
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
row3 = _mm_packs_epi32(r_xmm0, r_xmm2);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
row1 = _mm_packs_epi32(r_xmm4, r_xmm6);
// Row 6 and row 8 (1-based numbering): loads input rows 5 and 7
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[5*8]));
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[7*8]));
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
row5 = _mm_packs_epi32(r_xmm0, r_xmm2);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
row7 = _mm_packs_epi32(r_xmm4, r_xmm6);
// Second pass: combine row0..row7 with saturating 16-bit arithmetic.
// tg_3_16 and cos_4_16 are stored minus 1.0, so after each _mm_mulhi_epi16
// with them the multiplicand is added back.
r_xmm1 = _mm_load_si128((__m128i *) shortM128_tg_3_16);
r_xmm2 = row5;
r_xmm3 = row3;
r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);
r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
r_xmm5 = _mm_load_si128((__m128i *) shortM128_tg_1_16);
r_xmm6 = row7;
r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm2);
r_xmm5 = _mm_mulhi_epi16(r_xmm5, row1);
r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm3);
r_xmm7 = row6;
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
r_xmm3 = _mm_load_si128((__m128i *) shortM128_tg_2_16);
r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
r_xmm1 = r_xmm0;
r_xmm3 = _mm_mulhi_epi16(r_xmm3, row2);
r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i *) shortM128_one_corr));
r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
r_xmm6 = r_xmm5;
r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_one_corr));
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
//Intermediate results, needed later
__m128i temp3, temp7;
temp7 = r_xmm0;
r_xmm1 = r_xmm4;
r_xmm0 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
r_xmm2 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);
//Intermediate results, needed later
temp3 = r_xmm6;
r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm5);
r_xmm7 = _mm_adds_epi16(r_xmm7, row2);
r_xmm3 = _mm_subs_epi16(r_xmm3, row6);
r_xmm6 = row0;
r_xmm0 = _mm_mulhi_epi16(r_xmm0, r_xmm1);
r_xmm5 = row4;
r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm6);
r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);
// OR with 1 forces the lowest bit of every lane on
r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i *) shortM128_one_corr));
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i *) shortM128_one_corr));
r_xmm2 = r_xmm5;
r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
r_xmm1 = r_xmm6;
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_round_inv_col));
r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
r_xmm7 = temp7;
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i *) shortM128_round_inv_col));
r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i *) shortM128_round_inv_corr));
r_xmm3 = r_xmm6;
r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i *) shortM128_round_inv_corr));
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);
// Result for row 0 (kept in a register; the 16-bit store below is disabled)
//_mm_store_si128((__m128i *) pOutput, r_xmm7);
__m128i r0 = r_xmm7;
r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
r_xmm7 = r_xmm1;
r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);
// Result for row 1
//_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
__m128i r1 = r_xmm6;
r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
r_xmm6 = temp3;
r_xmm7 = _mm_subs_epi16(r_xmm7, r_xmm0);
r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
// Result for row 2
//_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
__m128i r2 = r_xmm1;
r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);
// Result for row 7
//_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
__m128i r7 = r_xmm5;
r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);
// Result for row 3
//_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
__m128i r3 = r_xmm6;
r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);
// Results for rows 4, 5, and 6
//_mm_store_si128((__m128i *) (&pOutput[4*8]), r_xmm2);
//_mm_store_si128((__m128i *) (&pOutput[5*8]), r_xmm7);
//_mm_store_si128((__m128i *) (&pOutput[6*8]), r_xmm3);
__m128i r4 = r_xmm2;
__m128i r5 = r_xmm7;
__m128i r6 = r_xmm3;
// Add 128 to every 16-bit value
r0 = _mm_add_epi16(*(const __m128i *)shortM128_128, r0);
r1 = _mm_add_epi16(*(const __m128i *)shortM128_128, r1);
r2 = _mm_add_epi16(*(const __m128i *)shortM128_128, r2);
r3 = _mm_add_epi16(*(const __m128i *)shortM128_128, r3);
r4 = _mm_add_epi16(*(const __m128i *)shortM128_128, r4);
r5 = _mm_add_epi16(*(const __m128i *)shortM128_128, r5);
r6 = _mm_add_epi16(*(const __m128i *)shortM128_128, r6);
r7 = _mm_add_epi16(*(const __m128i *)shortM128_128, r7);
// Pack two rows per 128-bit store, saturating each value to 0..255
((__m128i *)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
((__m128i *)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
((__m128i *)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
((__m128i *)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
}

View File

@ -1,5 +1,24 @@
// jpge.cpp - C++ class for JPEG compression.
// Public domain, Rich Geldreich <richgel99@gmail.com>
// jpge.cpp - C++ class for JPEG compression. Richard Geldreich <richgel99@gmail.com>
// Supports grayscale, H1V1, H2V1, and H2V2 chroma subsampling factors, one or two pass Huffman table optimization, libjpeg-style quality 1-100 quality factors.
// Also supports using luma quantization tables for chroma.
//
// Released under two licenses. You are free to choose which license you want:
// License 1:
// Public Domain
//
// License 2:
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// v1.01, Dec. 18, 2010 - Initial release
// v1.02, Apr. 6, 2011 - Removed 2x2 ordered dither in H2V1 chroma subsampling method load_block_16_8_8(). (The rounding factor was 2, when it should have been 1. Either way, it wasn't helping.)
// v1.03, Apr. 16, 2011 - Added support for optimized Huffman code tables, optimized dynamic memory allocation down to only 1 alloc.
@ -7,13 +26,13 @@
// v1.04, May. 19, 2012: Forgot to set m_pFile ptr to NULL in cfile_stream::close(). Thanks to Owen Kaluza for reporting this bug.
// Code tweaks to fix VS2008 static code analysis warnings (all looked harmless).
// Code review revealed method load_block_16_8_8() (used for the non-default H2V1 sampling mode to downsample chroma) somehow didn't get the rounding factor fix from v1.02.
// v1.05, March 25, 2020: Added Apache 2.0 alternate license
#include "jpge.h"
#include <stdlib.h>
#include <string.h>
// Higher level wrappers/examples (optional).
#include <stdio.h>
#include <malloc.h>
#define JPGE_MAX(a,b) (((a)>(b))?(a):(b))
#define JPGE_MIN(a,b) (((a)<(b))?(a):(b))
@ -30,6 +49,11 @@ enum { DC_LUM_CODES = 12, AC_LUM_CODES = 256, DC_CHROMA_CODES = 12, AC_CHROMA_CO
// Zigzag scan order: entry i is the index, within the 8x8 block, of the i-th coefficient visited.
static uint8 s_zag[64] = { 0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
// Base quantization tables passed to compute_quant_table() when params::m_use_std_tables is set
// (the "traditional tables from JPEG Annex K" mentioned in jpge.h): luminance, then chrominance.
static int16 s_std_lum_quant[64] = { 16,11,12,14,12,10,16,14,13,14,18,17,16,19,24,40,26,24,22,22,24,49,35,37,29,40,58,51,61,60,57,51,56,55,64,72,92,78,64,68,87,69,55,56,80,109,81,87,95,98,103,104,103,62,77,113,121,112,100,120,92,101,103,99 };
static int16 s_std_croma_quant[64] = { 17,18,18,24,21,24,47,26,26,47,99,66,56,66,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99 };
// Table from http://www.imagemagick.org/discourse-server/viewtopic.php?f=22&t=20333&p=98008#p98008
// This is mozjpeg's default table, in zag order.
// Used for both quantization tables when params::m_use_std_tables is false.
static int16 s_alt_quant[64] = { 16,16,16,16,17,16,18,20,20,18,25,27,24,27,25,37,34,31,31,34,37,56,40,43,40,43,40,56,85,53,62,53,53,62,53,85,75,91,74,69,74,91,75,135,106,94,94,106,135,156,131,124,131,156,189,169,169,189,238,226,238,311,311,418 };
// NOTE(review): presumably Huffman table specifications - *_bits holds the number of codes
// per code length (index 0 unused) and *_val the symbol values; confirm against the code that consumes them.
static uint8 s_dc_lum_bits[17] = { 0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0 };
static uint8 s_dc_lum_val[DC_LUM_CODES] = { 0,1,2,3,4,5,6,7,8,9,10,11 };
static uint8 s_ac_lum_bits[17] = { 0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d };
@ -61,6 +85,11 @@ template <class T> inline void clear_obj(T &obj) { memset(&obj, 0, sizeof(obj));
// Integer colour-conversion weights scaled by 65536: YR + YG + YB == 65536, and each of the
// CB_* and CR_* triples sums to 0. Presumably consumed by the RGB_to_YCC-style converters below - confirm.
const int YR = 19595, YG = 38470, YB = 7471, CB_R = -11059, CB_G = -21709, CB_B = 32768, CR_R = 32768, CR_G = -27439, CR_B = -5329;
// Saturates i to the range 0..255 and returns the result as a byte.
static inline uint8 clamp(int i)
{
	if (i < 0)
		return 0;
	if (i > 255)
		return 255;
	return static_cast<uint8>(i);
}
static inline int left_shifti(int val, uint32 bits)
{
return static_cast<int>(static_cast<uint32>(val) << bits);
}
static void RGB_to_YCC(uint8* pDst, const uint8* pSrc, int num_pixels)
{
for (; num_pixels; pDst += 3, pSrc += 3, num_pixels--)
@ -127,8 +156,8 @@ static void DCT2D(int32 *p)
{
int32 s0 = q[0], s1 = q[1], s2 = q[2], s3 = q[3], s4 = q[4], s5 = q[5], s6 = q[6], s7 = q[7];
DCT1D(s0, s1, s2, s3, s4, s5, s6, s7);
q[0] = s0 << ROW_BITS; q[1] = DCT_DESCALE(s1, CONST_BITS-ROW_BITS); q[2] = DCT_DESCALE(s2, CONST_BITS-ROW_BITS); q[3] = DCT_DESCALE(s3, CONST_BITS-ROW_BITS);
q[4] = s4 << ROW_BITS; q[5] = DCT_DESCALE(s5, CONST_BITS-ROW_BITS); q[6] = DCT_DESCALE(s6, CONST_BITS-ROW_BITS); q[7] = DCT_DESCALE(s7, CONST_BITS-ROW_BITS);
q[0] = left_shifti(s0, ROW_BITS); q[1] = DCT_DESCALE(s1, CONST_BITS - ROW_BITS); q[2] = DCT_DESCALE(s2, CONST_BITS - ROW_BITS); q[3] = DCT_DESCALE(s3, CONST_BITS - ROW_BITS);
q[4] = left_shifti(s4, ROW_BITS); q[5] = DCT_DESCALE(s5, CONST_BITS - ROW_BITS); q[6] = DCT_DESCALE(s6, CONST_BITS - ROW_BITS); q[7] = DCT_DESCALE(s7, CONST_BITS - ROW_BITS);
}
for (q = p, c = 7; c >= 0; c--, q++)
{
@ -169,8 +198,10 @@ static void calculate_minimum_redundancy(sym_freq *A, int n)
A[0].m_key += A[1].m_key; root = 0; leaf = 2;
for (next = 1; next < n - 1; next++)
{
if (leaf>=n || A[root].m_key<A[leaf].m_key) { A[next].m_key = A[root].m_key; A[root++].m_key = next; } else A[next].m_key = A[leaf++].m_key;
if (leaf>=n || (root<next && A[root].m_key<A[leaf].m_key)) { A[next].m_key += A[root].m_key; A[root++].m_key = next; } else A[next].m_key += A[leaf++].m_key;
if (leaf >= n || A[root].m_key < A[leaf].m_key) { A[next].m_key = A[root].m_key; A[root++].m_key = next; }
else A[next].m_key = A[leaf++].m_key;
if (leaf >= n || (root < next && A[root].m_key < A[leaf].m_key)) { A[next].m_key += A[root].m_key; A[root++].m_key = next; }
else A[next].m_key += A[leaf++].m_key;
}
A[n - 2].m_key = 0;
for (next = n - 3; next >= 0; next--) A[next].m_key = A[A[next].m_key].m_key + 1;
@ -487,8 +518,16 @@ bool jpeg_encoder::jpg_open(int p_x_res, int p_y_res, int src_channels)
for (int i = 1; i < m_mcu_y; i++)
m_mcu_lines[i] = m_mcu_lines[i - 1] + m_image_bpl_mcu;
if (m_params.m_use_std_tables)
{
compute_quant_table(m_quantization_tables[0], s_std_lum_quant);
compute_quant_table(m_quantization_tables[1], m_params.m_no_chroma_discrim_flag ? s_std_lum_quant : s_std_croma_quant);
}
else
{
compute_quant_table(m_quantization_tables[0], s_alt_quant);
memcpy(m_quantization_tables[1], m_quantization_tables[0], sizeof(m_quantization_tables[1]));
}
m_out_buf_left = JPGE_OUT_BUF_SIZE;
m_pOut_buf = m_out_buf;
@ -541,16 +580,14 @@ void jpeg_encoder::load_block_16_8(int x, int c)
uint8* pSrc1, * pSrc2;
sample_array_t* pDst = m_sample_array;
x = (x * (16 * 3)) + c;
int a = 0, b = 2;
for (int i = 0; i < 16; i += 2, pDst += 8)
{
pSrc1 = m_mcu_lines[i + 0] + x;
pSrc2 = m_mcu_lines[i + 1] + x;
pDst[0] = ((pSrc1[ 0 * 3] + pSrc1[ 1 * 3] + pSrc2[ 0 * 3] + pSrc2[ 1 * 3] + a) >> 2) - 128; pDst[1] = ((pSrc1[ 2 * 3] + pSrc1[ 3 * 3] + pSrc2[ 2 * 3] + pSrc2[ 3 * 3] + b) >> 2) - 128;
pDst[2] = ((pSrc1[ 4 * 3] + pSrc1[ 5 * 3] + pSrc2[ 4 * 3] + pSrc2[ 5 * 3] + a) >> 2) - 128; pDst[3] = ((pSrc1[ 6 * 3] + pSrc1[ 7 * 3] + pSrc2[ 6 * 3] + pSrc2[ 7 * 3] + b) >> 2) - 128;
pDst[4] = ((pSrc1[ 8 * 3] + pSrc1[ 9 * 3] + pSrc2[ 8 * 3] + pSrc2[ 9 * 3] + a) >> 2) - 128; pDst[5] = ((pSrc1[10 * 3] + pSrc1[11 * 3] + pSrc2[10 * 3] + pSrc2[11 * 3] + b) >> 2) - 128;
pDst[6] = ((pSrc1[12 * 3] + pSrc1[13 * 3] + pSrc2[12 * 3] + pSrc2[13 * 3] + a) >> 2) - 128; pDst[7] = ((pSrc1[14 * 3] + pSrc1[15 * 3] + pSrc2[14 * 3] + pSrc2[15 * 3] + b) >> 2) - 128;
int temp = a; a = b; b = temp;
pDst[0] = ((pSrc1[0 * 3] + pSrc1[1 * 3] + pSrc2[0 * 3] + pSrc2[1 * 3] + 2) >> 2) - 128; pDst[1] = ((pSrc1[2 * 3] + pSrc1[3 * 3] + pSrc2[2 * 3] + pSrc2[3 * 3] + 2) >> 2) - 128;
pDst[2] = ((pSrc1[4 * 3] + pSrc1[5 * 3] + pSrc2[4 * 3] + pSrc2[5 * 3] + 2) >> 2) - 128; pDst[3] = ((pSrc1[6 * 3] + pSrc1[7 * 3] + pSrc2[6 * 3] + pSrc2[7 * 3] + 2) >> 2) - 128;
pDst[4] = ((pSrc1[8 * 3] + pSrc1[9 * 3] + pSrc2[8 * 3] + pSrc2[9 * 3] + 2) >> 2) - 128; pDst[5] = ((pSrc1[10 * 3] + pSrc1[11 * 3] + pSrc2[10 * 3] + pSrc2[11 * 3] + 2) >> 2) - 128;
pDst[6] = ((pSrc1[12 * 3] + pSrc1[13 * 3] + pSrc2[12 * 3] + pSrc2[13 * 3] + 2) >> 2) - 128; pDst[7] = ((pSrc1[14 * 3] + pSrc1[15 * 3] + pSrc2[14 * 3] + pSrc2[15 * 3] + 2) >> 2) - 128;
}
}
@ -562,10 +599,10 @@ void jpeg_encoder::load_block_16_8_8(int x, int c)
for (int i = 0; i < 8; i++, pDst += 8)
{
pSrc1 = m_mcu_lines[i + 0] + x;
pDst[0] = ((pSrc1[ 0 * 3] + pSrc1[ 1 * 3]) >> 1) - 128; pDst[1] = ((pSrc1[ 2 * 3] + pSrc1[ 3 * 3]) >> 1) - 128;
pDst[2] = ((pSrc1[ 4 * 3] + pSrc1[ 5 * 3]) >> 1) - 128; pDst[3] = ((pSrc1[ 6 * 3] + pSrc1[ 7 * 3]) >> 1) - 128;
pDst[4] = ((pSrc1[ 8 * 3] + pSrc1[ 9 * 3]) >> 1) - 128; pDst[5] = ((pSrc1[10 * 3] + pSrc1[11 * 3]) >> 1) - 128;
pDst[6] = ((pSrc1[12 * 3] + pSrc1[13 * 3]) >> 1) - 128; pDst[7] = ((pSrc1[14 * 3] + pSrc1[15 * 3]) >> 1) - 128;
pDst[0] = ((pSrc1[0 * 3] + pSrc1[1 * 3] + 1) >> 1) - 128; pDst[1] = ((pSrc1[2 * 3] + pSrc1[3 * 3] + 1) >> 1) - 128;
pDst[2] = ((pSrc1[4 * 3] + pSrc1[5 * 3] + 1) >> 1) - 128; pDst[3] = ((pSrc1[6 * 3] + pSrc1[7 * 3] + 1) >> 1) - 128;
pDst[4] = ((pSrc1[8 * 3] + pSrc1[9 * 3] + 1) >> 1) - 128; pDst[5] = ((pSrc1[10 * 3] + pSrc1[11 * 3] + 1) >> 1) - 128;
pDst[6] = ((pSrc1[12 * 3] + pSrc1[13 * 3] + 1) >> 1) - 128; pDst[7] = ((pSrc1[14 * 3] + pSrc1[15 * 3] + 1) >> 1) - 128;
}
}
@ -896,6 +933,86 @@ bool jpeg_encoder::process_scanline(const void* pScanline)
return m_all_stream_writes_succeeded;
}
// Higher level wrappers/examples (optional).
#include <stdio.h>
// output_stream implementation that writes to a file through C stdio.
// m_bStatus is a sticky success flag: open() sets it according to whether the file
// could be opened, and a failed write or a failed fclose() clears it.
class cfile_stream : public output_stream
{
	// Private and declared without a body here, so outside code cannot copy the stream.
	cfile_stream(const cfile_stream&);
	cfile_stream& operator= (const cfile_stream&);

	FILE* m_pFile;
	bool m_bStatus;

public:
	cfile_stream() : m_pFile(NULL), m_bStatus(false) { }

	// Closes the file if it is still open.
	virtual ~cfile_stream()
	{
		close();
	}

	// Closes any file currently open, then opens pFilename for binary writing.
	// Returns true if the file was opened.
	bool open(const char* pFilename)
	{
		close();
		m_pFile = fopen(pFilename, "wb");
		m_bStatus = (m_pFile != NULL);
		return m_bStatus;
	}

	// Closes the file if one is open. Returns the status flag, which is false if
	// open(), any write, or the fclose() itself failed.
	bool close()
	{
		if (m_pFile)
		{
			if (fclose(m_pFile) == EOF)
			{
				m_bStatus = false;
			}
			m_pFile = NULL;
		}
		return m_bStatus;
	}

	// Writes len bytes from pBuf. Returns the status flag after the write; once the
	// flag is false no further writes are attempted.
	virtual bool put_buf(const void* pBuf, int len)
	{
		if (!m_pFile || len < 0)
		{
			// No open file (e.g. put_buf() after close(), where m_bStatus may still be true)
			// or a nonsensical length: report failure instead of handing fwrite a NULL
			// stream or a negative length converted to a huge size_t.
			m_bStatus = false;
		}
		else if (len > 0)
		{
			m_bStatus = m_bStatus && (fwrite(pBuf, static_cast<size_t>(len), 1, m_pFile) == 1);
		}
		// len == 0: nothing to write. fwrite() returns 0 when the element size is 0,
		// which must not be mistaken for a failed write, so the status is left as-is.
		return m_bStatus;
	}

	// Returns the current file position as reported by ftell(), or 0 when no file is open.
	// Note that ftell()'s long result is converted to uint, so an ftell() failure (-1)
	// is not reported as 0.
	uint get_size() const
	{
		return m_pFile ? ftell(m_pFile) : 0;
	}
};
// Writes JPEG image to file.
// pImage_data is read as 'height' consecutive rows of width * num_channels bytes each.
// Every row is fed to the encoder once per pass (get_total_passes() passes in total),
// and each pass is terminated by a NULL scanline.
// Returns false if the file cannot be opened, the encoder cannot be initialised, any
// scanline is rejected, or the output stream reports a failure when it is closed.
bool compress_image_to_jpeg_file(const char* pFilename, int width, int height, int num_channels, const uint8* pImage_data, const params& comp_params)
{
	cfile_stream dst_stream;
	if (!dst_stream.open(pFilename))
		return false;

	jpge::jpeg_encoder dst_image;
	if (!dst_image.init(&dst_stream, width, height, num_channels, comp_params))
		return false;

	// Bytes per source row, held in a 64-bit type so that the per-row offset below
	// cannot overflow int for very large images (signed overflow is undefined behaviour).
	const long long row_bytes = static_cast<long long>(width) * num_channels;

	for (uint pass_index = 0; pass_index < dst_image.get_total_passes(); pass_index++)
	{
		for (int i = 0; i < height; i++)
		{
			const uint8* pBuf = pImage_data + i * row_bytes;
			if (!dst_image.process_scanline(pBuf))
				return false;
		}
		// A NULL scanline marks the end of the current pass.
		if (!dst_image.process_scanline(NULL))
			return false;
	}

	dst_image.deinit();

	return dst_stream.close();
}
class memory_stream : public output_stream
{
memory_stream(const memory_stream&);

View File

@ -1,5 +1,5 @@
// jpge.h - C++ class for JPEG compression.
// Public domain, Rich Geldreich <richgel99@gmail.com>
// Public Domain or Apache 2.0, Richard Geldreich <richgel99@gmail.com>
// Alex Evans: Added RGBA support, linear memory allocator.
#ifndef JPEG_ENCODER_H
#define JPEG_ENCODER_H
@ -19,7 +19,7 @@ namespace jpge
// JPEG compression parameters structure.
struct params
{
inline params() : m_quality(85), m_subsampling(H2V2), m_no_chroma_discrim_flag(false), m_two_pass_flag(false) { }
inline params() : m_quality(85), m_subsampling(H2V2), m_no_chroma_discrim_flag(false), m_two_pass_flag(false), m_use_std_tables(false) { }
inline bool check() const
{
@ -43,6 +43,10 @@ namespace jpge
bool m_no_chroma_discrim_flag;
bool m_two_pass_flag;
// By default we use the same quantization tables as mozjpeg's default.
// Set to true to use the traditional tables from JPEG Annex K.
bool m_use_std_tables;
};
// Writes JPEG image to a file.

View File

@ -1 +0,0 @@
This is https://code.google.com/p/jpeg-compressor/ , public domain code.