Mirror of https://github.com/libretro/RetroArch.git (synced 2024-11-27 10:10:57 +00:00).
Commit 44937348e8 (parent c4176564c5): "Start implementing SSE2 conversion of YUV."
@ -20,6 +20,7 @@
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include "../general.h"
|
||||
#include "../driver.h"
|
||||
#include "../performance.h"
|
||||
#include "../miscellaneous.h"
|
||||
@ -49,8 +50,8 @@ typedef struct video4linux
|
||||
int fd;
|
||||
struct buffer *buffers;
|
||||
unsigned n_buffers;
|
||||
size_t width;
|
||||
size_t height;
|
||||
unsigned width;
|
||||
unsigned height;
|
||||
size_t pitch;
|
||||
|
||||
struct scaler_ctx scaler;
|
||||
@ -231,6 +232,8 @@ static bool init_device(void *data)
|
||||
return false;
|
||||
}
|
||||
|
||||
RARCH_LOG("V4L2 device: %u x %u.\n", v4l->width, v4l->height);
|
||||
|
||||
return init_mmap(v4l);
|
||||
}
|
||||
|
||||
@ -314,9 +317,9 @@ static void *v4l_init(const char *device, uint64_t caps, unsigned width, unsigne
|
||||
|
||||
strlcpy(v4l->dev_name, device ? device : "/dev/video0", sizeof(v4l->dev_name));
|
||||
|
||||
v4l->width = width;
|
||||
v4l->height = height;
|
||||
v4l->ready = false;
|
||||
v4l->width = width;
|
||||
v4l->height = height;
|
||||
v4l->ready = false;
|
||||
|
||||
if (stat(v4l->dev_name, &st) == -1)
|
||||
{
|
||||
|
@ -682,6 +682,135 @@ static inline uint8_t clamp_8bit(int val)
|
||||
return val;
|
||||
}
|
||||
|
||||
/* Fixed-point YUV -> RGB coefficients, scaled by (1 << YUV_SHIFT).
 * NOTE(review): Y is used full-range here (no "- 16" bias is applied in the
 * remainder loop below) — confirm this matches the capture format in use. */
#define YUV_SHIFT 6
#define YUV_OFFSET (1 << (YUV_SHIFT - 1)) /* Rounding term added before the shift. */
#define YUV_MAT_Y (1 << 6)  /* Y contribution (unity gain at this scale). */
#define YUV_MAT_U_G (-22)   /* U contribution to G. */
#define YUV_MAT_U_B (113)   /* U contribution to B. */
#define YUV_MAT_V_R (90)    /* V contribution to R. */
#define YUV_MAT_V_G (-46)   /* V contribution to G. */
#if defined(__SSE2__)
/* Convert packed YUYV 4:2:2 to ARGB8888 (0xAARRGGBB, alpha forced to 0xff).
 *
 * output_    - destination ARGB8888 buffer.
 * input_     - source YUYV buffer (byte order per pixel pair: Y0 U0 Y1 V0).
 * width      - frame width in pixels (expected even; YUYV pairs pixels).
 * height     - frame height in pixels.
 * out_stride - destination pitch in bytes.
 * in_stride  - source pitch in bytes.
 *
 * The SIMD loop converts 16 pixels (32 source bytes) per iteration;
 * any remainder when width is not a multiple of 16 is finished in plain C.
 */
void conv_yuyv_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint8_t *input = (const uint8_t*)input_;
   uint32_t *output = (uint32_t*)output_;

   /* Byte-lane masks used to deinterleave YUYV. */
   const __m128i mask_y = _mm_set1_epi16(0xffu);       /* Even bytes: Y samples.         */
   const __m128i mask_u = _mm_set1_epi32(0xffu << 8);  /* Byte 1 of each 32-bit lane: U. */
   const __m128i mask_v = _mm_set1_epi32(0xffu << 24); /* Byte 3 of each 32-bit lane: V. */
   const __m128i chroma_offset = _mm_set1_epi16(128);
   const __m128i round_offset = _mm_set1_epi16(YUV_OFFSET);

   const __m128i yuv_mul = _mm_set1_epi16(YUV_MAT_Y);
   const __m128i u_g_mul = _mm_set1_epi16(YUV_MAT_U_G);
   const __m128i u_b_mul = _mm_set1_epi16(YUV_MAT_U_B);
   const __m128i v_r_mul = _mm_set1_epi16(YUV_MAT_V_R);
   const __m128i v_g_mul = _mm_set1_epi16(YUV_MAT_V_G);
   /* All-ones vector; its bytes become the 0xff alpha channel. */
   const __m128i a = _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128());

   for (h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
   {
      const uint8_t *src = input;
      uint32_t *dst = output;

      // Each loop processes 16 pixels.
      for (w = 0; w + 16 <= width; w += 16, src += 32, dst += 16)
      {
         __m128i yuv0 = _mm_loadu_si128((const __m128i*)(src + 0));  // [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...]
         __m128i yuv1 = _mm_loadu_si128((const __m128i*)(src + 16)); // Next 8 pixels, same layout.

         __m128i y0 = _mm_and_si128(yuv0, mask_y); // [Y0, Y1, Y2, ...] as 16-bit lanes
         __m128i u0 = _mm_and_si128(yuv0, mask_u); // [0, U0, 0, 0,  0, U1, 0, 0, ...]
         __m128i v0 = _mm_and_si128(yuv0, mask_v); // [0, 0, 0, V0,  0, 0, 0, V1, ...]
         __m128i y1 = _mm_and_si128(yuv1, mask_y); // [Y0, Y1, Y2, ...] as 16-bit lanes
         __m128i u1 = _mm_and_si128(yuv1, mask_u); // [0, U0, 0, 0,  0, U1, 0, 0, ...]
         __m128i v1 = _mm_and_si128(yuv1, mask_v); // [0, 0, 0, V0,  0, 0, 0, V1, ...]

         // Juggle around to get U and V in the same 16-bit format as Y:
         // shift each sample down to byte 0 of its 32-bit lane, then pack
         // the two registers into one of 16-bit lanes. Values are 0..255,
         // so the signed saturation of packs cannot trigger.
         u0 = _mm_srli_si128(u0, 1);
         v0 = _mm_srli_si128(v0, 3);
         u1 = _mm_srli_si128(u1, 1);
         v1 = _mm_srli_si128(v1, 3);
         __m128i u = _mm_packs_epi32(u0, u1);
         __m128i v = _mm_packs_epi32(v0, v1);

         // Center chroma around zero: (U, V) -= 128.
         u = _mm_sub_epi16(u, chroma_offset);
         v = _mm_sub_epi16(v, chroma_offset);

         // Upscale chroma horizontally (nearest): in 4:2:2 each U/V
         // sample is shared by two adjacent pixels.
         u0 = _mm_unpacklo_epi16(u, u);
         u1 = _mm_unpackhi_epi16(u, u);
         v0 = _mm_unpacklo_epi16(v, v);
         v1 = _mm_unpackhi_epi16(v, v);

         // Apply the fixed-point color matrix.
         // Max magnitude stays well inside int16: 255 * 64 = 16320.
         y0 = _mm_mullo_epi16(y0, yuv_mul);
         y1 = _mm_mullo_epi16(y1, yuv_mul);
         __m128i u0_g = _mm_mullo_epi16(u0, u_g_mul);
         __m128i u1_g = _mm_mullo_epi16(u1, u_g_mul);
         __m128i u0_b = _mm_mullo_epi16(u0, u_b_mul);
         __m128i u1_b = _mm_mullo_epi16(u1, u_b_mul);
         __m128i v0_r = _mm_mullo_epi16(v0, v_r_mul);
         __m128i v1_r = _mm_mullo_epi16(v1, v_r_mul);
         __m128i v0_g = _mm_mullo_epi16(v0, v_g_mul);
         __m128i v1_g = _mm_mullo_epi16(v1, v_g_mul);

         // Add contributions from the transformed components
         // (saturating adds), round, and shift back down.
         __m128i r0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y0, v0_r), round_offset), YUV_SHIFT);
         __m128i g0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_mm_adds_epi16(y0, v0_g), u0_g), round_offset), YUV_SHIFT);
         __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y0, u0_b), round_offset), YUV_SHIFT);

         __m128i r1 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y1, v1_r), round_offset), YUV_SHIFT);
         __m128i g1 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_mm_adds_epi16(y1, v1_g), u1_g), round_offset), YUV_SHIFT);
         __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y1, u1_b), round_offset), YUV_SHIFT);

         // Saturate into 8-bit.
         r0 = _mm_packus_epi16(r0, r1);
         g0 = _mm_packus_epi16(g0, g1);
         b0 = _mm_packus_epi16(b0, b1);

         // Interleave into ARGB (little-endian memory order B, G, R, A).
         __m128i res_lo_bg = _mm_unpacklo_epi8(b0, g0);
         __m128i res_hi_bg = _mm_unpackhi_epi8(b0, g0);
         __m128i res_lo_ra = _mm_unpacklo_epi8(r0, a);
         __m128i res_hi_ra = _mm_unpackhi_epi8(r0, a);
         __m128i res0 = _mm_unpacklo_epi16(res_lo_bg, res_lo_ra);
         __m128i res1 = _mm_unpackhi_epi16(res_lo_bg, res_lo_ra);
         __m128i res2 = _mm_unpacklo_epi16(res_hi_bg, res_hi_ra);
         __m128i res3 = _mm_unpackhi_epi16(res_hi_bg, res_hi_ra);

         _mm_storeu_si128((__m128i*)(dst + 0), res0);
         _mm_storeu_si128((__m128i*)(dst + 4), res1);
         _mm_storeu_si128((__m128i*)(dst + 8), res2);
         _mm_storeu_si128((__m128i*)(dst + 12), res3);
      }

      // Finish off the rest (if any) in C, two pixels per YUYV quad.
      for (; w < width; w += 2, src += 4, dst += 2)
      {
         int y0 = src[0];
         int u = src[1] - 128;
         int y1 = src[2];
         int v = src[3] - 128;

         uint8_t r0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t g0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t b0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);

         uint8_t r1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t g1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t b1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);

         dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
         dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
      }
   }
}
|
||||
#else
|
||||
void conv_yuyv_argb8888(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
int out_stride, int in_stride)
|
||||
@ -697,24 +826,25 @@ void conv_yuyv_argb8888(void *output_, const void *input_,
|
||||
|
||||
for (w = 0; w < width; w += 2, src += 4, dst += 2)
|
||||
{
|
||||
int y0 = src[0] - 16;
|
||||
int y0 = src[0];
|
||||
int u = src[1] - 128;
|
||||
int y1 = src[2] - 16;
|
||||
int y1 = src[2];
|
||||
int v = src[3] - 128;
|
||||
|
||||
uint8_t r0 = clamp_8bit((298 * y0 + 409 * v + 128) >> 8);
|
||||
uint8_t g0 = clamp_8bit((298 * y0 - 100 * u - 208 * v + 128) >> 8);
|
||||
uint8_t b0 = clamp_8bit((298 * y0 + 516 * u + 128) >> 8);
|
||||
uint8_t r0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
|
||||
uint8_t g0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
|
||||
uint8_t b0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
|
||||
|
||||
uint8_t r1 = clamp_8bit((298 * y1 + 409 * v + 128) >> 8);
|
||||
uint8_t g1 = clamp_8bit((298 * y1 - 100 * u - 208 * v + 128) >> 8);
|
||||
uint8_t b1 = clamp_8bit((298 * y1 + 516 * u + 128) >> 8);
|
||||
uint8_t r1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
|
||||
uint8_t g1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
|
||||
uint8_t b1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
|
||||
|
||||
dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
|
||||
dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void conv_copy(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
|
Loading…
Reference in New Issue
Block a user