diff --git a/camera/video4linux2.c b/camera/video4linux2.c
index d858881559..e2c104ab52 100644
--- a/camera/video4linux2.c
+++ b/camera/video4linux2.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include "../general.h"
 #include "../driver.h"
 #include "../performance.h"
 #include "../miscellaneous.h"
@@ -49,8 +50,8 @@ typedef struct video4linux
    int fd;
    struct buffer *buffers;
    unsigned n_buffers;
-   size_t width;
-   size_t height;
+   unsigned width;
+   unsigned height;
    size_t pitch;
 
    struct scaler_ctx scaler;
@@ -231,6 +232,8 @@ static bool init_device(void *data)
       return false;
    }
 
+   RARCH_LOG("V4L2 device: %u x %u.\n", v4l->width, v4l->height);
+
    return init_mmap(v4l);
 }
 
@@ -314,9 +317,9 @@ static void *v4l_init(const char *device, uint64_t caps, unsigned width, unsigne
    strlcpy(v4l->dev_name, device ? device : "/dev/video0", sizeof(v4l->dev_name));
 
-   v4l->width = width;
-   v4l->height = height;
-   v4l->ready = false;
+   v4l->width  = width;
+   v4l->height = height;
+   v4l->ready  = false;
 
    if (stat(v4l->dev_name, &st) == -1)
    {
diff --git a/gfx/scaler/pixconv.c b/gfx/scaler/pixconv.c
index 2b7b7fa8a6..20dc9315ff 100644
--- a/gfx/scaler/pixconv.c
+++ b/gfx/scaler/pixconv.c
@@ -682,6 +682,135 @@ static inline uint8_t clamp_8bit(int val)
    return val;
 }
 
+// Full-range BT.601 YUV -> RGB coefficients in Q6 fixed point.
+#define YUV_SHIFT 6
+#define YUV_OFFSET (1 << (YUV_SHIFT - 1))
+#define YUV_MAT_Y (1 << 6)
+#define YUV_MAT_U_G (-22)
+#define YUV_MAT_U_B (113)
+#define YUV_MAT_V_R (90)
+#define YUV_MAT_V_G (-46)
+
+#if defined(__SSE2__)
+void conv_yuyv_argb8888(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   int h, w;
+   const uint8_t *input = (const uint8_t*)input_;
+   uint32_t *output = (uint32_t*)output_;
+
+   const __m128i mask_y = _mm_set1_epi16(0xffu);
+   const __m128i mask_u = _mm_set1_epi32(0xffu << 8);
+   const __m128i mask_v = _mm_set1_epi32(0xffu << 24);
+   const __m128i chroma_offset = _mm_set1_epi16(128);
+   const __m128i round_offset = _mm_set1_epi16(YUV_OFFSET);
+
+   const __m128i yuv_mul = _mm_set1_epi16(YUV_MAT_Y);
+   const __m128i u_g_mul = _mm_set1_epi16(YUV_MAT_U_G);
+   const __m128i u_b_mul = _mm_set1_epi16(YUV_MAT_U_B);
+   const __m128i v_r_mul = _mm_set1_epi16(YUV_MAT_V_R);
+   const __m128i v_g_mul = _mm_set1_epi16(YUV_MAT_V_G);
+   const __m128i a = _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128());
+
+   for (h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
+   {
+      const uint8_t *src = input;
+      uint32_t *dst = output;
+
+      // Each iteration processes 16 pixels.
+      for (w = 0; w + 16 <= width; w += 16, src += 32, dst += 16)
+      {
+         __m128i yuv0 = _mm_loadu_si128((const __m128i*)(src +  0)); // [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...]
+         __m128i yuv1 = _mm_loadu_si128((const __m128i*)(src + 16)); // [Y8, U4, Y9, V4, Y10, U5, Y11, V5, ...]
+
+         __m128i y0 = _mm_and_si128(yuv0, mask_y); // [Y0, Y1, Y2, ...] (16-bit)
+         __m128i u0 = _mm_and_si128(yuv0, mask_u); // [0, U0, 0, 0, 0, U1, 0, 0, ...]
+         __m128i v0 = _mm_and_si128(yuv0, mask_v); // [0, 0, 0, V0, 0, 0, 0, V1, ...]
+         __m128i y1 = _mm_and_si128(yuv1, mask_y); // [Y8, Y9, Y10, ...] (16-bit)
+         __m128i u1 = _mm_and_si128(yuv1, mask_u); // [0, U4, 0, 0, 0, U5, 0, 0, ...]
+         __m128i v1 = _mm_and_si128(yuv1, mask_v); // [0, 0, 0, V4, 0, 0, 0, V5, ...]
+
+         // Juggle around to get U and V in the same 16-bit format as Y.
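+         // After masking, U sits at byte 1 and V at byte 3 of each 32-bit
+         // lane; shifting the whole register right by 1 and 3 bytes moves
+         // them to the low byte of their lane, and the signed 32- to 16-bit
+         // packs merge both registers into one 16-bit vector each for U and V.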
+         u0 = _mm_srli_si128(u0, 1);
+         v0 = _mm_srli_si128(v0, 3);
+         u1 = _mm_srli_si128(u1, 1);
+         v1 = _mm_srli_si128(v1, 3);
+         __m128i u = _mm_packs_epi32(u0, u1);
+         __m128i v = _mm_packs_epi32(v0, v1);
+
+         // Apply YUV offsets: (U, V) -= (128, 128).
+         u = _mm_sub_epi16(u, chroma_offset);
+         v = _mm_sub_epi16(v, chroma_offset);
+
+         // Upscale chroma horizontally (nearest neighbor).
+         u0 = _mm_unpacklo_epi16(u, u);
+         u1 = _mm_unpackhi_epi16(u, u);
+         v0 = _mm_unpacklo_epi16(v, v);
+         v1 = _mm_unpackhi_epi16(v, v);
+
+         // Apply transformations.
+         y0 = _mm_mullo_epi16(y0, yuv_mul);
+         y1 = _mm_mullo_epi16(y1, yuv_mul);
+         __m128i u0_g = _mm_mullo_epi16(u0, u_g_mul);
+         __m128i u1_g = _mm_mullo_epi16(u1, u_g_mul);
+         __m128i u0_b = _mm_mullo_epi16(u0, u_b_mul);
+         __m128i u1_b = _mm_mullo_epi16(u1, u_b_mul);
+         __m128i v0_r = _mm_mullo_epi16(v0, v_r_mul);
+         __m128i v1_r = _mm_mullo_epi16(v1, v_r_mul);
+         __m128i v0_g = _mm_mullo_epi16(v0, v_g_mul);
+         __m128i v1_g = _mm_mullo_epi16(v1, v_g_mul);
+
+         // Add contributions from the transformed components.
+         __m128i r0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y0, v0_r), round_offset), YUV_SHIFT);
+         __m128i g0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_mm_adds_epi16(y0, v0_g), u0_g), round_offset), YUV_SHIFT);
+         __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y0, u0_b), round_offset), YUV_SHIFT);
+
+         __m128i r1 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y1, v1_r), round_offset), YUV_SHIFT);
+         __m128i g1 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_mm_adds_epi16(y1, v1_g), u1_g), round_offset), YUV_SHIFT);
+         __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(y1, u1_b), round_offset), YUV_SHIFT);
+
+         // Saturate into 8-bit.
+         r0 = _mm_packus_epi16(r0, r1);
+         g0 = _mm_packus_epi16(g0, g1);
+         b0 = _mm_packus_epi16(b0, b1);
+
+         // Interleave into ARGB.
+         __m128i res_lo_bg = _mm_unpacklo_epi8(b0, g0);
+         __m128i res_hi_bg = _mm_unpackhi_epi8(b0, g0);
+         __m128i res_lo_ra = _mm_unpacklo_epi8(r0, a);
+         __m128i res_hi_ra = _mm_unpackhi_epi8(r0, a);
+         __m128i res0 = _mm_unpacklo_epi16(res_lo_bg, res_lo_ra);
+         __m128i res1 = _mm_unpackhi_epi16(res_lo_bg, res_lo_ra);
+         __m128i res2 = _mm_unpacklo_epi16(res_hi_bg, res_hi_ra);
+         __m128i res3 = _mm_unpackhi_epi16(res_hi_bg, res_hi_ra);
+
+         _mm_storeu_si128((__m128i*)(dst +  0), res0);
+         _mm_storeu_si128((__m128i*)(dst +  4), res1);
+         _mm_storeu_si128((__m128i*)(dst +  8), res2);
+         _mm_storeu_si128((__m128i*)(dst + 12), res3);
+      }
+
+      // Finish off the rest (if any) in C.
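+      // The scalar tail handles the remaining width % 16 pixels. YUYV
+      // packs two pixels into four bytes, so it advances two pixels per
+      // iteration (width is assumed to be even).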
+      for (; w < width; w += 2, src += 4, dst += 2)
+      {
+         int y0 = src[0];
+         int u = src[1] - 128;
+         int y1 = src[2];
+         int v = src[3] - 128;
+
+         uint8_t r0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
+         uint8_t g0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
+         uint8_t b0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
+
+         uint8_t r1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
+         uint8_t g1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
+         uint8_t b1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
+
+         dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
+         dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
+      }
+   }
+}
+#else
 void conv_yuyv_argb8888(void *output_, const void *input_,
       int width, int height,
       int out_stride, int in_stride)
@@ -697,24 +826,25 @@ void conv_yuyv_argb8888(void *output_, const void *input_,
 
    for (w = 0; w < width; w += 2, src += 4, dst += 2)
    {
-      int y0 = src[0] - 16;
+      int y0 = src[0];
       int u = src[1] - 128;
-      int y1 = src[2] - 16;
+      int y1 = src[2];
       int v = src[3] - 128;
 
-      uint8_t r0 = clamp_8bit((298 * y0 + 409 * v + 128) >> 8);
-      uint8_t g0 = clamp_8bit((298 * y0 - 100 * u - 208 * v + 128) >> 8);
-      uint8_t b0 = clamp_8bit((298 * y0 + 516 * u + 128) >> 8);
+      uint8_t r0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
+      uint8_t g0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
+      uint8_t b0 = clamp_8bit((YUV_MAT_Y * y0 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
 
-      uint8_t r1 = clamp_8bit((298 * y1 + 409 * v + 128) >> 8);
-      uint8_t g1 = clamp_8bit((298 * y1 - 100 * u - 208 * v + 128) >> 8);
-      uint8_t b1 = clamp_8bit((298 * y1 + 516 * u + 128) >> 8);
+      uint8_t r1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
+      uint8_t g1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
+      uint8_t b1 = clamp_8bit((YUV_MAT_Y * y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
 
       dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
       dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
    }
 }
+#endif
 
 void conv_copy(void *output_, const void *input_, int width, int height,
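
Note: a minimal standalone sketch, not part of the patch, checking that the
YUV_MAT_* constants above match the full-range BT.601 conversion
(R = Y + 1.402 V, G = Y - 0.344 U - 0.714 V, B = Y + 1.772 U) scaled by
2^YUV_SHIFT = 64 and rounded to the nearest integer:

   // Prints 64, 90, -22, -46, 113, matching the macros in the patch.
   #include <stdio.h>

   int main(void)
   {
      printf("YUV_MAT_Y   = %d\n", 1 << 6);        // Y contribution: 1.0 * 64
      printf("YUV_MAT_V_R = %.0f\n",  1.402 * 64); // V -> R
      printf("YUV_MAT_U_G = %.0f\n", -0.344 * 64); // U -> G
      printf("YUV_MAT_V_G = %.0f\n", -0.714 * 64); // V -> G
      printf("YUV_MAT_U_B = %.0f\n",  1.772 * 64); // U -> B
      return 0;
   }

YUV_OFFSET (1 << 5) is the rounding term added before the final right shift
by YUV_SHIFT.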