Add custom scaling routines.

Implemented for point, bilinear, lanczos.
Partly optimized for SSE2.
This commit is contained in:
Themaister 2012-09-02 14:30:46 +02:00
parent 22e43d4d84
commit 19fa31f17d
12 changed files with 1176 additions and 41 deletions

View File

@ -126,6 +126,7 @@ endif
ifeq ($(HAVE_SDL), 1)
OBJ += gfx/sdl_gfx.o gfx/context/sdl_ctx.o input/sdl_input.o audio/sdl_audio.o fifo_buffer.o
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
DEFINES += $(SDL_CFLAGS) $(BSD_LOCAL_INC)
LIBS += $(SDL_LIBS)

View File

@ -62,6 +62,7 @@ endif
ifeq ($(HAVE_SDL), 1)
OBJ += gfx/sdl_gfx.o gfx/gl.o gfx/math/matrix.o gfx/fonts/freetype.o gfx/context/sdl_ctx.o input/sdl_input.o audio/sdl_audio.o fifo_buffer.o
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
LIBS += -lSDL
DEFINES += -ISDL -DHAVE_SDL
endif

250
gfx/scaler/filter.c Normal file
View File

@ -0,0 +1,250 @@
#include "filter.h"
#include <math.h>
#include <stdio.h>
#include <string.h>
// Allocates the coefficient and source-position tables for the horizontal
// and the vertical pass. The sizes are derived from filter_stride and the
// output dimensions, so filter_stride must be set before calling.
//
// Returns true only if all four tables were allocated. On failure the
// tables that did get allocated stay in ctx; scaler_ctx_gen_reset() frees them.
static bool allocate_filters(struct scaler_ctx *ctx)
{
   ctx->horiz.filter     = (int16_t*)scaler_alloc(sizeof(int16_t), ctx->horiz.filter_stride * ctx->out_width);
   ctx->horiz.filter_pos = (int*)scaler_alloc(sizeof(int), ctx->out_width);

   ctx->vert.filter      = (int16_t*)scaler_alloc(sizeof(int16_t), ctx->vert.filter_stride * ctx->out_height);
   ctx->vert.filter_pos  = (int*)scaler_alloc(sizeof(int), ctx->out_height);

   // The generators write to the position tables as well as to the
   // coefficient tables, so every one of them has to be checked.
   return ctx->horiz.filter && ctx->horiz.filter_pos &&
      ctx->vert.filter && ctx->vert.filter_pos;
}
// Builds a one-tap filter for a single axis: every output sample takes the
// input sample at the integer part of its 16.16 fixed-point position with
// full weight. pos is the position of output sample 0, step the advance
// per output sample.
static void gen_filter_point_sub(struct scaler_filter *filter, int len, int pos, int step)
{
   int16_t *weights = filter->filter;
   int *src_index = filter->filter_pos;

   for (int out = 0; out < len; out++)
   {
      src_index[out] = pos >> 16;
      weights[out] = FILTER_UNITY;
      pos += step;
   }
}
static bool gen_filter_point(struct scaler_ctx *ctx)
{
ctx->horiz.filter_len = 1;
ctx->horiz.filter_stride = 1;
ctx->vert.filter_len = 1;
ctx->vert.filter_stride = 1;
if (!allocate_filters(ctx))
return false;
int x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
int x_step = (1 << 16) * ctx->in_width / ctx->out_width;
int y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
int y_step = (1 << 16) * ctx->in_height / ctx->out_height;
gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
return true;
}
// Builds a two-tap linear-interpolation filter for a single axis.
// For each output sample the integer part of the 16.16 position selects the
// first input sample; the fractional part, reduced from 16 to 14 bits to
// match FILTER_UNITY, is the weight of the second input sample.
static void gen_filter_bilinear_sub(struct scaler_filter *filter, int len, int pos, int step)
{
   int16_t *taps = filter->filter;

   for (int out = 0; out < len; out++, taps += 2)
   {
      filter->filter_pos[out] = pos >> 16;

      taps[1] = (pos & 0xffff) >> 2;
      taps[0] = FILTER_UNITY - taps[1];

      pos += step;
   }
}
static bool gen_filter_bilinear(struct scaler_ctx *ctx)
{
ctx->horiz.filter_len = 2;
ctx->horiz.filter_stride = 2;
ctx->vert.filter_len = 2;
ctx->vert.filter_stride = 2;
if (!allocate_filters(ctx))
return false;
int x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
int x_step = (1 << 16) * ctx->in_width / ctx->out_width;
int y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
int y_step = (1 << 16) * ctx->in_height / ctx->out_height;
gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
return true;
}
static inline double sinc(double phase)
{
if (fabs(phase) < 0.0001)
return 1.0;
else
return sin(phase) / phase;
}
// Rounds v up to the nearest power of two; powers of two map to themselves.
// Works by smearing the highest set bit of (v - 1) into all lower bits and
// adding one. An input of 0 wraps around and yields 0.
static inline unsigned next_pow2(unsigned v)
{
   v -= 1;
   for (unsigned shift = 1; shift <= 16; shift <<= 1)
      v |= v >> shift;
   return v + 1;
}
// Builds a windowed-sinc filter for a single axis.
//
// filter:    tables to fill; filter->filter_len is the number of taps.
// len:       number of output samples.
// pos:       16.16 fixed-point position of the first tap of output sample 0.
// step:      16.16 fixed-point advance per output sample.
// phase_mul: scale applied to the sinc phase and to the coefficient itself
//            (the caller passes out/in when downscaling, 1.0 otherwise).
static void gen_filter_sinc_sub(struct scaler_filter *filter, int len, int pos, int step, double phase_mul)
{
const int sinc_size = filter->filter_len;
for (int i = 0; i < len; i++, pos += step)
{
// Integer part of the position selects the first input sample of the window.
filter->filter_pos[i] = pos >> 16;
//int16_t sinc_sum = 0;
for (int j = 0; j < sinc_size; j++)
{
// Distance (times pi) between tap j and the window centre, which sits
// sinc_size / 2 samples plus the fractional part of pos past the first tap.
double sinc_phase = M_PI * ((double)((sinc_size << 15) + (pos & 0xffff)) / 0x10000 - j);
// Phase of the window function: the same distance stretched over half the window.
double lanczos_phase = sinc_phase / ((sinc_size >> 1));
// Product of sinc and window, converted to fixed point (FILTER_UNITY == 1.0).
int16_t sinc_val = FILTER_UNITY * sinc(sinc_phase * phase_mul) * sinc(lanczos_phase) * phase_mul;
//sinc_sum += sinc_val;
// Coefficients of one output sample are stored contiguously, sinc_size apart.
filter->filter[i * sinc_size + j] = sinc_val;
}
//fprintf(stderr, "Sinc sum = %.3lf\n", (double)sinc_sum / FILTER_UNITY);
}
}
// Sets up windowed-sinc filters for both axes.
// Returns false if the filter tables could not be allocated.
static bool gen_filter_sinc(struct scaler_ctx *ctx)
{
   // Need to expand the filter when downsampling to get a proper low-pass effect.
   // NOTE(review): the tap count is derived from the horizontal ratio only
   // and is used for the vertical filter as well.
   unsigned expand = 1;
   if (ctx->in_width > ctx->out_width)
      expand = next_pow2(ctx->in_width / ctx->out_width);
   const int sinc_size = 8 * expand;

   ctx->horiz.filter_len = sinc_size;
   ctx->horiz.filter_stride = sinc_size;
   ctx->vert.filter_len = sinc_size;
   ctx->vert.filter_stride = sinc_size;

   if (!allocate_filters(ctx))
      return false;

   // 16.16 fixed point. The start is pulled back by half the window so
   // that the taps are centred around the sampling position.
   const int x_step = (1 << 16) * ctx->in_width / ctx->out_width;
   const int x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15) - (sinc_size << 15);
   const int y_step = (1 << 16) * ctx->in_height / ctx->out_height;
   const int y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15) - (sinc_size << 15);

   // When downscaling, the phase is scaled by out/in; otherwise it is left alone.
   double phase_mul_horiz = 1.0;
   if (ctx->in_width > ctx->out_width)
      phase_mul_horiz = (double)ctx->out_width / ctx->in_width;

   double phase_mul_vert = 1.0;
   if (ctx->in_height > ctx->out_height)
      phase_mul_vert = (double)ctx->out_height / ctx->in_height;

   gen_filter_sinc_sub(&ctx->horiz, ctx->out_width, x_pos, x_step, phase_mul_horiz);
   gen_filter_sinc_sub(&ctx->vert, ctx->out_height, y_pos, y_step, phase_mul_vert);
   return true;
}
// Checks that every filter window lies completely inside the input image.
// The first offending output coordinate is reported on stderr.
// Returns true if all horizontal and vertical windows are in range.
static bool validate_filter(struct scaler_ctx *ctx)
{
   // Highest start index at which a full window still fits.
   int max_w_pos = ctx->in_width - ctx->horiz.filter_len;
   int max_h_pos = ctx->in_height - ctx->vert.filter_len;

   for (int x = 0; x < ctx->out_width; x++)
   {
      const int start = ctx->horiz.filter_pos[x];
      if (start < 0 || start > max_w_pos)
      {
         fprintf(stderr, "Out X = %d => In X = %d\n", x, start);
         return false;
      }
   }

   for (int y = 0; y < ctx->out_height; y++)
   {
      const int start = ctx->vert.filter_pos[y];
      if (start < 0 || start > max_h_pos)
      {
         fprintf(stderr, "Out Y = %d => In Y = %d\n", y, start);
         return false;
      }
   }

   return true;
}
// Moves filter windows that stick out of the input range [0, in_len) back
// inside it, for one axis.
//
// A window is moved by changing its start position; its coefficients are
// shifted by the same amount in the opposite direction so that the remaining
// ones still line up with the same input samples. Coefficients that belonged
// to samples outside the input are dropped and the freed slots are zeroed.
//
// filter:  filter whose filter_pos / filter tables are adjusted in place.
// out_len: number of output samples (entries in filter_pos).
// in_len:  number of input samples along this axis.
static void fixup_filter_sub(struct scaler_filter *filter, int out_len, int in_len)
{
// Highest start position at which the whole window still fits.
int max_pos = in_len - filter->filter_len;
for (int i = 0; i < out_len; i++)
{
// Number of samples the window reaches past the end / before the start.
// Both are computed from the unmodified start position.
int postsample = filter->filter_pos[i] - max_pos;
int presample = -filter->filter_pos[i];
if (postsample > 0)
{
// Pull the window back so that it ends on the last input sample.
filter->filter_pos[i] -= postsample;
int16_t *base_filter = filter->filter + i * filter->filter_stride;
if (postsample > (int)filter->filter_len)
// The window was entirely past the end: nothing valid remains.
memset(base_filter, 0, filter->filter_len * sizeof(int16_t));
else
{
// Shift coefficients towards higher taps, zero the vacated leading taps.
memmove(base_filter + postsample, base_filter, (filter->filter_len - postsample) * sizeof(int16_t));
memset(base_filter, 0, postsample * sizeof(int16_t));
}
}
if (presample > 0)
{
// Push the window forward by the amount computed from the original start.
filter->filter_pos[i] += presample;
int16_t *base_filter = filter->filter + i * filter->filter_stride;
if (presample > (int)filter->filter_len)
// The window was entirely before the start: nothing valid remains.
memset(base_filter, 0, filter->filter_len * sizeof(int16_t));
else
{
// Shift coefficients towards lower taps, zero the vacated trailing taps.
memmove(base_filter, base_filter + presample, (filter->filter_len - presample) * sizeof(int16_t));
memset(base_filter + (filter->filter_len - presample), 0, presample * sizeof(int16_t));
}
}
}
}
// Clamps the filter windows of both axes to the input rectangle so that
// the scalers never read outside of it.
static void fixup_filter(struct scaler_ctx *ctx)
{
   struct scaler_filter *horiz = &ctx->horiz;
   struct scaler_filter *vert = &ctx->vert;

   fixup_filter_sub(horiz, ctx->out_width, ctx->in_width);
   fixup_filter_sub(vert, ctx->out_height, ctx->in_height);
}
// Generates the horizontal and vertical filters selected by
// ctx->scaler_type, clamps them to the input rectangle and validates them.
// Returns false for an unknown scaler type, if generation fails, or if the
// resulting filter does not pass validation.
bool scaler_gen_filter(struct scaler_ctx *ctx)
{
   bool generated;

   if (ctx->scaler_type == SCALER_TYPE_POINT)
      generated = gen_filter_point(ctx);
   else if (ctx->scaler_type == SCALER_TYPE_BILINEAR)
      generated = gen_filter_bilinear(ctx);
   else if (ctx->scaler_type == SCALER_TYPE_SINC)
      generated = gen_filter_sinc(ctx);
   else
      return false;

   if (generated)
   {
      fixup_filter(ctx);
      return validate_filter(ctx);
   }

   return false;
}

10
gfx/scaler/filter.h Normal file
View File

@ -0,0 +1,10 @@
#ifndef FILTER_H__
#define FILTER_H__
#include <stdbool.h>
#include "scaler.h"
// Generates the horizontal and vertical filter tables for ctx according to
// ctx->scaler_type. Returns false on failure.
bool scaler_gen_filter(struct scaler_ctx *ctx);
#endif

171
gfx/scaler/main.c Normal file
View File

@ -0,0 +1,171 @@
#include "scaler.h"
#include <Imlib2.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <getopt.h>
#include <string.h>
// Settings filled in by parse_args().
// Relative scale factors in X and Y; 1.0 keeps the input size.
static float g_horiz_scale = 1.0f;
static float g_vert_scale = 1.0f;
// Filter used for scaling; sinc unless overridden with -t/--type.
static enum scaler_type g_scaler_type = SCALER_TYPE_SINC;
// Heap copies of the input/output paths; NULL until the option is seen.
static char *g_in_path;
static char *g_out_path;
// Writes the command line usage summary to stderr.
static void print_help(void)
{
   static const char *const usage[] = {
      "Usage: scale [...options...]\n",
      "\t-i/--input: Input file\n",
      "\t-o/--output: Output file\n",
      "\t-x/--xscale: Relative scale in X\n",
      "\t-y/--yscale: Relative scale in Y\n",
      "\t-s/--scale: Relative scale in both X/Y\n",
      "\t-t/--type: Filter type. Valid ones are:\n",
      "\t\tsinc, point, bilinear\n",
      "\t-h/--help: Prints this help\n",
   };

   for (size_t i = 0; i < sizeof(usage) / sizeof(usage[0]); i++)
      fputs(usage[i], stderr);
}
// Parses a scale factor. The whole string must be a number and the value
// must be positive; otherwise false is returned and *scale is left alone.
static bool parse_scale_arg(const char *arg, float *scale)
{
   char *end = NULL;
   float value = strtof(arg, &end);

   // !(value > 0) also rejects NaN.
   if (end == arg || *end != '\0' || !(value > 0.0f))
      return false;

   *scale = value;
   return true;
}

// Stores a heap copy of arg in *path, releasing any copy stored earlier
// (an option may be given more than once). Returns false if out of memory.
static bool replace_path_arg(char **path, const char *arg)
{
   char *copy = strdup(arg);
   if (!copy)
      return false;

   free(*path);
   *path = copy;
   return true;
}

// Parses the command line into the g_* settings.
//
// Returns true if the arguments are valid and both an input and an output
// path were given. On invalid input the usage text is printed and false is
// returned. -h/--help prints the usage text and exits the process.
static bool parse_args(int argc, char *argv[])
{
   const struct option opts[] = {
      { "xscale", required_argument, NULL, 'x' },
      { "yscale", required_argument, NULL, 'y' },
      { "scale", required_argument, NULL, 's' },
      { "input", required_argument, NULL, 'i' },
      { "output", required_argument, NULL, 'o' },
      { "type", required_argument, NULL, 't' },
      { "help", no_argument, NULL, 'h' },
      { NULL, 0, NULL, 0 },
   };

   const char *optstring = "x:y:i:o:t:s:h";

   for (;;)
   {
      int c = getopt_long(argc, argv, optstring, opts, NULL);
      if (c == -1)
         break;

      switch (c)
      {
         case 'h':
            print_help();
            exit(EXIT_SUCCESS);

         case 's':
            if (!parse_scale_arg(optarg, &g_horiz_scale))
            {
               print_help();
               return false;
            }
            g_vert_scale = g_horiz_scale;
            break;

         case 'x':
            if (!parse_scale_arg(optarg, &g_horiz_scale))
            {
               print_help();
               return false;
            }
            break;

         case 'y':
            if (!parse_scale_arg(optarg, &g_vert_scale))
            {
               print_help();
               return false;
            }
            break;

         case 'i':
            if (!replace_path_arg(&g_in_path, optarg))
               return false;
            break;

         case 'o':
            if (!replace_path_arg(&g_out_path, optarg))
               return false;
            break;

         case '?':
            print_help();
            return false;

         case 't':
            if (strcmp(optarg, "sinc") == 0)
               g_scaler_type = SCALER_TYPE_SINC;
            else if (strcmp(optarg, "bilinear") == 0)
               g_scaler_type = SCALER_TYPE_BILINEAR;
            else if (strcmp(optarg, "point") == 0)
               g_scaler_type = SCALER_TYPE_POINT;
            else
            {
               print_help();
               return false;
            }
            break;

         default:
            break;
      }
   }

   // Both paths are mandatory.
   if (!g_in_path || !g_out_path)
   {
      print_help();
      return false;
   }

   // Stray non-option arguments are not accepted.
   if (optind < argc)
   {
      print_help();
      return false;
   }

   return true;
}
// Test driver: loads an image with Imlib2, scales it with the scaler using
// the factors and filter given on the command line and saves the result.
// Returns EXIT_SUCCESS if the scaled image was handed to Imlib2 for saving,
// EXIT_FAILURE otherwise.
int main(int argc, char *argv[])
{
   int ret = EXIT_FAILURE;
   uint32_t *scale_buf = NULL;
   struct scaler_ctx ctx = {0};
   Imlib_Image img;
   Imlib_Image new_img;
   const char *fmt;

   if (!parse_args(argc, argv))
      return EXIT_FAILURE;

   img = imlib_load_image(g_in_path);
   if (!img)
   {
      fprintf(stderr, "Failed to load image: %s\n", g_in_path);
      goto end;
   }

   imlib_context_set_image(img);

   ctx.in_width = imlib_image_get_width();
   ctx.in_height = imlib_image_get_height();
   ctx.out_width = (int)(ctx.in_width * g_horiz_scale);
   ctx.out_height = (int)(ctx.in_height * g_vert_scale);
   ctx.in_stride = ctx.in_width * sizeof(uint32_t);
   ctx.out_stride = ctx.out_width * sizeof(uint32_t);
   ctx.in_fmt = SCALER_FMT_ARGB8888;
   ctx.out_fmt = SCALER_FMT_ARGB8888;
   ctx.scaler_type = g_scaler_type;

   // A zero-sized output would make the filter generation divide by zero.
   if (ctx.out_width <= 0 || ctx.out_height <= 0)
   {
      fprintf(stderr, "Scaled image would be empty.\n");
      goto end_image;
   }

   // Must not live inside assert(): the call would disappear with NDEBUG.
   if (!scaler_ctx_gen_filter(&ctx))
   {
      fprintf(stderr, "Failed to set up scaler.\n");
      goto end_image;
   }

   scale_buf = (uint32_t*)calloc((size_t)ctx.out_width * ctx.out_height, sizeof(uint32_t));
   if (!scale_buf)
   {
      fprintf(stderr, "Failed to allocate output buffer.\n");
      goto end_image;
   }

   scaler_ctx_scale(&ctx, scale_buf, imlib_image_get_data_for_reading_only());

   new_img = imlib_create_image_using_data(ctx.out_width, ctx.out_height,
         scale_buf);
   if (!new_img)
   {
      fprintf(stderr, "Failed to create output image.\n");
      goto end_image;
   }

   // Release the source image and make the scaled one current.
   imlib_free_image();
   imlib_context_set_image(new_img);

   // The output format is taken from the file extension, PNG if there is none.
   fmt = strrchr(g_out_path, '.');
   if (fmt)
      fmt++;
   else
      fmt = "png";

   imlib_image_set_format(fmt);
   imlib_save_image(g_out_path);
   ret = EXIT_SUCCESS;

end_image:
   // Frees whichever image is current: the source on failure, the scaled
   // one on success. Done before scale_buf is freed because the scaled
   // image was created on top of that buffer.
   imlib_free_image();

end:
   free(scale_buf);
   free(g_in_path);
   free(g_out_path);
   scaler_ctx_gen_reset(&ctx);
   return ret;
}

131
gfx/scaler/pixconv.c Normal file
View File

@ -0,0 +1,131 @@
#include "pixconv.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// Converts a 0RGB1555 image to ARGB8888.
//
// Each 5-bit channel is expanded to 8 bits by replicating its top bits into
// the low bits, so that 0x1f maps to 0xff. Alpha is always set to 0xff.
//
// output_/input_: first pixel of the destination / source image.
// width/height:   image size in pixels.
// out_stride/in_stride: distance between rows in bytes.
void conv_0rgb1555_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;

   for (int h = 0; h < height; h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      for (int w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 10) & 0x1f;
         uint32_t g = (col >> 5) & 0x1f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 3) | (g >> 2);
         b = (b << 3) | (b >> 2);

         // Unsigned constant: shifting a signed 0xff into bit 31 is undefined.
         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
// Converts a 0RGB1555 image to packed 24-bit BGR (bytes in B, G, R order).
// Each 5-bit channel is widened to 8 bits by replicating its top bits.
// Strides are in bytes.
void conv_0rgb1555_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint16_t *src_row = (const uint16_t*)input_;
   uint8_t *dst_row = (uint8_t*)output_;

   for (int y = 0; y < height; y++)
   {
      for (int x = 0; x < width; x++)
      {
         const uint32_t pix = src_row[x];
         const uint32_t b5 = pix & 0x1f;
         const uint32_t g5 = (pix >> 5) & 0x1f;
         const uint32_t r5 = (pix >> 10) & 0x1f;

         dst_row[3 * x + 0] = (b5 << 3) | (b5 >> 2);
         dst_row[3 * x + 1] = (g5 << 3) | (g5 >> 2);
         dst_row[3 * x + 2] = (r5 << 3) | (r5 >> 2);
      }

      dst_row += out_stride;
      src_row += in_stride >> 1;
   }
}
// Converts a packed 24-bit BGR image (bytes in B, G, R order) to ARGB8888.
// Alpha is always set to 0xff.
//
// output_/input_: first pixel of the destination / source image.
// width/height:   image size in pixels.
// out_stride/in_stride: distance between rows in bytes.
void conv_bgr24_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint8_t *input = (const uint8_t*)input_;
   uint32_t *output = (uint32_t*)output_;

   for (int h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
   {
      const uint8_t *inp = input;
      for (int w = 0; w < width; w++)
      {
         uint32_t b = *inp++;
         uint32_t g = *inp++;
         uint32_t r = *inp++;

         // Unsigned constant: shifting a signed 0xff into bit 31 is undefined.
         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
// Converts an ARGB8888 image to 0RGB1555 by keeping the top five bits of
// each colour channel. Alpha is discarded. Strides are in bytes.
void conv_argb8888_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint32_t *src_row = (const uint32_t*)input_;
   uint16_t *dst_row = (uint16_t*)output_;

   for (int y = 0; y < height; y++)
   {
      for (int x = 0; x < width; x++)
      {
         const uint32_t pix = src_row[x];
         const uint16_t r5 = (pix >> 19) & 0x1f;
         const uint16_t g5 = (pix >> 11) & 0x1f;
         const uint16_t b5 = (pix >> 3) & 0x1f;

         dst_row[x] = (r5 << 10) | (g5 << 5) | (b5 << 0);
      }

      dst_row += out_stride >> 1;
      src_row += in_stride >> 2;
   }
}
// Converts an ARGB8888 image to packed 24-bit BGR (bytes in B, G, R order).
// Alpha is discarded. Strides are in bytes.
void conv_argb8888_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint32_t *src_row = (const uint32_t*)input_;
   uint8_t *dst_row = (uint8_t*)output_;

   for (int y = 0; y < height; y++)
   {
      for (int x = 0; x < width; x++)
      {
         const uint32_t pix = src_row[x];
         uint8_t *dst = dst_row + 3 * x;

         dst[0] = (uint8_t)(pix >> 0);
         dst[1] = (uint8_t)(pix >> 8);
         dst[2] = (uint8_t)(pix >> 16);
      }

      dst_row += out_stride;
      src_row += in_stride >> 2;
   }
}
// Copies an image row by row without changing the pixel format.
// The number of bytes copied per row is the smaller of the two stride
// magnitudes; width is not used. Strides are in bytes.
void conv_copy(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const int out_len = abs(out_stride);
   const int in_len = abs(in_stride);
   const int row_bytes = in_len < out_len ? in_len : out_len;

   const uint8_t *src = (const uint8_t*)input_;
   uint8_t *dst = (uint8_t*)output_;

   for (int row = 0; row < height; row++)
   {
      memcpy(dst, src, row_bytes);
      dst += out_stride;
      src += in_stride;
   }
}

29
gfx/scaler/pixconv.h Normal file
View File

@ -0,0 +1,29 @@
#ifndef PIXCONV_H__
#define PIXCONV_H__
// Pixel format converters. All of them share one signature:
//   output / input:        first pixel of the destination / source image.
//   width / height:        image size in pixels.
//   out_stride / in_stride: distance between consecutive rows in bytes.

// 0RGB1555 -> ARGB8888, alpha set to 0xff.
void conv_0rgb1555_argb8888(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
// Packed BGR24 -> ARGB8888, alpha set to 0xff.
void conv_bgr24_argb8888(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
// ARGB8888 -> 0RGB1555, alpha dropped.
void conv_argb8888_0rgb1555(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
// ARGB8888 -> packed BGR24, alpha dropped.
void conv_argb8888_bgr24(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
// 0RGB1555 -> packed BGR24.
void conv_0rgb1555_bgr24(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
// Row-wise copy without format change.
void conv_copy(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
#endif

195
gfx/scaler/scaler.c Normal file
View File

@ -0,0 +1,195 @@
#include "scaler.h"
#include "scaler_int.h"
#include "filter.h"
#include "pixconv.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
// Allocation hook for the scaler, kept separate in case aligned
// allocations are needed later. Returns zero-initialized memory for
// elem_size * size bytes, or NULL on failure.
void *scaler_alloc(size_t elem_size, size_t size)
{
   void *block = calloc(elem_size, size);
   return block;
}
// Releases memory obtained from scaler_alloc(). Forwards to free().
void scaler_free(void *ptr)
{
free(ptr);
}
// Allocates the intermediate frames used while scaling:
//  - scaled: result of the horizontal pass, out_width x in_height pixels
//    with 64 bits per pixel.
//  - input / output: ARGB8888 staging frames, only allocated when the
//    respective external format is not ARGB8888.
// Row lengths are padded to a multiple of 8 pixels.
// Returns false as soon as one allocation fails.
static bool allocate_frames(struct scaler_ctx *ctx)
{
   const int padded_out_width = (ctx->out_width + 7) & ~7;
   const int padded_in_width = (ctx->in_width + 7) & ~7;

   ctx->scaled.width = ctx->out_width;
   ctx->scaled.height = ctx->in_height;
   ctx->scaled.stride = padded_out_width * sizeof(uint64_t);

   // Strides are in bytes; the shifts turn byte counts into element counts.
   ctx->scaled.frame = (uint64_t*)scaler_alloc(sizeof(uint64_t),
         (ctx->scaled.stride * ctx->scaled.height) >> 3);
   if (ctx->scaled.frame == NULL)
      return false;

   if (ctx->in_fmt != SCALER_FMT_ARGB8888)
   {
      ctx->input.stride = padded_in_width * sizeof(uint32_t);
      ctx->input.frame = (uint32_t*)scaler_alloc(sizeof(uint32_t),
            (ctx->input.stride * ctx->in_height) >> 2);
      if (ctx->input.frame == NULL)
         return false;
   }

   if (ctx->out_fmt != SCALER_FMT_ARGB8888)
   {
      ctx->output.stride = padded_out_width * sizeof(uint32_t);
      ctx->output.frame = (uint32_t*)scaler_alloc(sizeof(uint32_t),
            (ctx->output.stride * ctx->out_height) >> 2);
      if (ctx->output.frame == NULL)
         return false;
   }

   return true;
}
static bool set_direct_pix_conv(struct scaler_ctx *ctx)
{
if (ctx->in_fmt == ctx->out_fmt)
ctx->direct_pixconv = conv_copy;
else if (ctx->in_fmt == SCALER_FMT_0RGB1555 && ctx->out_fmt == SCALER_FMT_ARGB8888)
ctx->direct_pixconv = conv_0rgb1555_argb8888;
else if (ctx->in_fmt == SCALER_FMT_BGR24 && ctx->out_fmt == SCALER_FMT_ARGB8888)
ctx->direct_pixconv = conv_bgr24_argb8888;
else if (ctx->in_fmt == SCALER_FMT_ARGB8888 && ctx->out_fmt == SCALER_FMT_0RGB1555)
ctx->direct_pixconv = conv_argb8888_0rgb1555;
else if (ctx->in_fmt == SCALER_FMT_ARGB8888 && ctx->out_fmt == SCALER_FMT_BGR24)
ctx->direct_pixconv = conv_argb8888_bgr24;
else if (ctx->in_fmt == SCALER_FMT_0RGB1555 && ctx->out_fmt == SCALER_FMT_BGR24)
ctx->direct_pixconv = conv_0rgb1555_bgr24;
else
return false;
return true;
}
// Selects the converters used around the scaler, which works on ARGB8888:
// in_pixconv turns the input format into ARGB8888, out_pixconv turns
// ARGB8888 into the output format. A side that already is ARGB8888 keeps
// its converter pointer untouched. Returns false for an unknown format.
static bool set_pix_conv(struct scaler_ctx *ctx)
{
   if (ctx->in_fmt == SCALER_FMT_0RGB1555)
      ctx->in_pixconv = conv_0rgb1555_argb8888;
   else if (ctx->in_fmt == SCALER_FMT_BGR24)
      ctx->in_pixconv = conv_bgr24_argb8888;
   else if (ctx->in_fmt != SCALER_FMT_ARGB8888)
      return false;

   if (ctx->out_fmt == SCALER_FMT_0RGB1555)
      ctx->out_pixconv = conv_argb8888_0rgb1555;
   else if (ctx->out_fmt == SCALER_FMT_BGR24)
      ctx->out_pixconv = conv_argb8888_bgr24;
   else if (ctx->out_fmt != SCALER_FMT_ARGB8888)
      return false;

   return true;
}
// Prepares ctx for scaler_ctx_scale() from the sizes, formats and scaler
// type stored in it. Resources from an earlier setup are released first.
// If input and output have the same size, only a pixel format conversion
// is set up; otherwise the scaler passes, pixel converters and filters are.
// Returns false on failure; anything allocated up to that point stays in
// ctx until scaler_ctx_gen_reset() is called.
bool scaler_ctx_gen_filter(struct scaler_ctx *ctx)
{
   scaler_ctx_gen_reset(ctx);

   ctx->unscaled = ctx->in_width == ctx->out_width &&
      ctx->in_height == ctx->out_height;

   if (!ctx->unscaled)
   {
      ctx->scaler_horiz = scaler_argb8888_horiz;
      ctx->scaler_vert = scaler_argb8888_vert;
   }

   if (!allocate_frames(ctx))
      return false;

   // Only pixel format conversion ...
   if (ctx->unscaled)
      return set_direct_pix_conv(ctx);

   if (!set_pix_conv(ctx))
      return false;

   return scaler_gen_filter(ctx);
}
// Releases all filter tables and intermediate frames owned by ctx and
// clears the corresponding sub-structures, so the context can be set up
// again or discarded. Sizes, formats and function pointers are left alone.
void scaler_ctx_gen_reset(struct scaler_ctx *ctx)
{
   void *owned[] = {
      ctx->horiz.filter,
      ctx->horiz.filter_pos,
      ctx->vert.filter,
      ctx->vert.filter_pos,
      ctx->scaled.frame,
      ctx->input.frame,
      ctx->output.frame,
   };

   for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); i++)
      scaler_free(owned[i]);

   memset(&ctx->horiz, 0, sizeof(ctx->horiz));
   memset(&ctx->vert, 0, sizeof(ctx->vert));
   memset(&ctx->scaled, 0, sizeof(ctx->scaled));
   memset(&ctx->input, 0, sizeof(ctx->input));
   memset(&ctx->output, 0, sizeof(ctx->output));
}
// Converts/scales one frame from input to output as configured by
// scaler_ctx_gen_filter().
//
// Unscaled contexts run the direct pixel converter only. Otherwise the
// input is converted to ARGB8888 if necessary, scaled horizontally into the
// intermediate frame, scaled vertically, and converted to the output
// format if necessary.
void scaler_ctx_scale(const struct scaler_ctx *ctx,
      void *output, const void *input)
{
   if (ctx->unscaled)
   {
      ctx->direct_pixconv(output, input,
            ctx->out_width, ctx->out_height,
            ctx->out_stride, ctx->in_stride);
      return;
   }

   // Horizontal pass reads either the caller's frame or the staging frame.
   const void *horiz_src = input;
   int horiz_stride = ctx->in_stride;

   if (ctx->in_fmt != SCALER_FMT_ARGB8888)
   {
      ctx->in_pixconv(ctx->input.frame, input,
            ctx->in_width, ctx->in_height,
            ctx->input.stride, ctx->in_stride);
      horiz_src = ctx->input.frame;
      horiz_stride = ctx->input.stride;
   }

   ctx->scaler_horiz(ctx, horiz_src, horiz_stride);

   // Vertical pass writes either straight to the caller or to the staging
   // frame, which is then converted to the requested format.
   if (ctx->out_fmt == SCALER_FMT_ARGB8888)
   {
      ctx->scaler_vert(ctx, output, ctx->out_stride);
      return;
   }

   ctx->scaler_vert(ctx, ctx->output.frame, ctx->output.stride);
   ctx->out_pixconv(output, ctx->output.frame,
         ctx->out_width, ctx->out_height,
         ctx->out_stride, ctx->output.stride);
}

90
gfx/scaler/scaler.h Normal file
View File

@ -0,0 +1,90 @@
#ifndef SCALER_H__
#define SCALER_H__
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
// Fixed-point value representing a filter coefficient of 1.0.
#define FILTER_UNITY (1 << 14)
// Pixel formats accepted as input and output of the scaler.
enum scaler_pix_fmt
{
// 32 bits per pixel; the format the scaler passes operate on internally.
SCALER_FMT_ARGB8888 = 0,
// 16 bits per pixel, 5 bits per colour channel.
SCALER_FMT_0RGB1555,
// 3 bytes per pixel, stored in B, G, R order.
SCALER_FMT_BGR24
};
// Filter used for scaling.
enum scaler_type
{
// Not a valid selection; filter generation fails for it.
SCALER_TYPE_UNKNOWN = 0,
// Nearest neighbour, one tap.
SCALER_TYPE_POINT,
// Linear interpolation, two taps.
SCALER_TYPE_BILINEAR,
// Windowed sinc, eight taps or more.
SCALER_TYPE_SINC
};
// Filter description for one axis (horizontal or vertical).
struct scaler_filter
{
// Coefficients in fixed point (FILTER_UNITY == 1.0); filter_stride entries per output sample.
int16_t *filter;
// Number of taps applied per output sample.
size_t filter_len;
// Distance, in coefficients, between the taps of consecutive output samples.
size_t filter_stride;
// For each output sample, index of the first input sample of its window.
int *filter_pos;
};
// Scaler context. The caller fills in the sizes, strides, formats and
// scaler type, then calls scaler_ctx_gen_filter(); everything below those
// fields is set up by the scaler itself.
struct scaler_ctx
{
// Input size in pixels and distance between input rows in bytes.
int in_width;
int in_height;
int in_stride;
// Output size in pixels and distance between output rows in bytes.
int out_width;
int out_height;
int out_stride;
// Pixel formats of the caller's input and output buffers.
enum scaler_pix_fmt in_fmt;
enum scaler_pix_fmt out_fmt;
// Filter to generate.
enum scaler_type scaler_type;
// Horizontal pass: (ctx, ARGB8888 input, input stride in bytes) -> scaled.frame.
void (*scaler_horiz)(const struct scaler_ctx*,
const void*, int);
// Vertical pass: scaled.frame -> (ctx, ARGB8888 output, output stride in bytes).
void (*scaler_vert)(const struct scaler_ctx*,
void*, int);
// Converters with arguments (output, input, width, height, out_stride, in_stride):
// input format -> ARGB8888, ARGB8888 -> output format, and the direct
// input -> output conversion used when no scaling is needed.
void (*in_pixconv)(void*, const void*, int, int, int, int);
void (*out_pixconv)(void*, const void*, int, int, int, int);
void (*direct_pixconv)(void*, const void*, int, int, int, int);
// True if input and output sizes are equal, i.e. only the format is converted.
bool unscaled;
// Filters for the two passes.
struct scaler_filter horiz, vert;
// ARGB8888 copy of the input; allocated only if in_fmt is not ARGB8888.
struct
{
uint32_t *frame;
int stride;
} input;
// Result of the horizontal pass: out_width x in_height pixels,
// 64 bits (four 16-bit channels) per pixel. stride is in bytes.
struct
{
uint64_t *frame;
int width;
int height;
int stride;
} scaled;
// ARGB8888 result of the vertical pass; allocated only if out_fmt is not ARGB8888.
struct
{
uint32_t *frame;
int stride;
} output;
};
// Sets up ctx for scaling/conversion from the parameters stored in it.
// Frees resources of a previous setup first, so ctx must be zero-initialized
// before the first call. Returns false on failure.
bool scaler_ctx_gen_filter(struct scaler_ctx *ctx);
// Frees the filter tables and intermediate frames owned by ctx.
void scaler_ctx_gen_reset(struct scaler_ctx *ctx);
// Scales/converts one frame from input to output using a prepared ctx.
void scaler_ctx_scale(const struct scaler_ctx *ctx,
void *output, const void *input);
// Allocates zero-initialized memory for elem_size * size bytes; NULL on failure.
void *scaler_alloc(size_t elem_size, size_t size);
// Frees memory returned by scaler_alloc().
void scaler_free(void *ptr);
#endif

214
gfx/scaler/scaler_int.c Normal file
View File

@ -0,0 +1,214 @@
#include "scaler_int.h"
#if defined(__SSE2__)
#include <emmintrin.h>
#endif
// Packs four 16-bit channels into one 64-bit pixel:
// alpha in bits 63..48, red in 47..32, green in 31..16, blue in 15..0.
static inline uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b)
{
   uint64_t pixel = a;
   pixel = (pixel << 16) | r;
   pixel = (pixel << 16) | g;
   pixel = (pixel << 16) | b;
   return pixel;
}
// Saturates a signed 16-bit value to the range of an 8-bit channel [0, 255].
static inline uint8_t clamp_8bit(int16_t col)
{
   if (col < 0)
      return 0;
   return col > 255 ? 255 : (uint8_t)col;
}
// ARGB8888 scaler is split in two:
//
// First, horizontal scaler is applied.
// Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 to left to occupy 15 bits.
// The sign bit is kept empty as we have to do signed multiplication for the filter.
// A mulhi [(a * b) >> 16] is applied which loses some precision, but is very efficient for SIMD.
// It is accurate enough for 8-bit purposes.
//
// The fixed point 1.0 for filter is (1 << 14). After horizontal scale, the output is kept
// with 16-bit channels, and will now have 13 bits of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
//
// Vertical scaler takes the 13 bit channels, and performs the same mulhi steps.
// Another 2 bits of precision is lost, which ends up as 11 bits.
// Scaling is now complete. Channels are shifted right by 3, and saturated into 8-bit values.
//
// The C version of scalers perform the exact same operations as the SIMD code for testing purposes.
#if defined(__SSE2__)
// Vertical pass (SSE2): filters the 64-bit-per-pixel intermediate frame
// (ctx->scaled) vertically and writes ARGB8888 pixels to output_.
//
// ctx:     prepared scaler context.
// output_: destination ARGB8888 frame.
// stride:  distance between destination rows in bytes.
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
{
   const uint64_t *input = ctx->scaled.frame;
   uint32_t *output = (uint32_t*)output_;

   const int16_t *filter_vert = ctx->vert.filter;

   for (int h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
   {
      // First intermediate row of the filter window for this output row.
      const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);

      for (int w = 0; w < ctx->out_width; w++)
      {
         __m128i res = _mm_setzero_si128();
         const uint64_t *input_base_y = input_base + w;

         size_t y;
         // Two taps per iteration: the low 64 bits handle row y, the high
         // 64 bits row y + 1. The pointer then advances by two rows
         // (stride >> 2 elements == 2 * (stride >> 3)).
         for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2))
         {
            // Broadcast each coefficient to the four 16-bit lanes of its
            // half. Done with set1 rather than an integer multiply, which
            // would turn a negative coefficient c into c - 1 in three lanes.
            __m128i coeff = _mm_unpacklo_epi64(_mm_set1_epi16(filter_vert[y + 0]),
                  _mm_set1_epi16(filter_vert[y + 1]));
            __m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         // Remaining tap for odd filter lengths; the high half of col is
         // zero, so it contributes nothing.
         for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
         {
            __m128i coeff = _mm_set1_epi16(filter_vert[y]);
            __m128i col = _mm_set_epi64x(0, input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         // Fold the two halves together, drop the remaining fractional
         // bits and saturate each channel to 8 bits.
         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
         res = _mm_srai_epi16(res, (7 - 2 - 2));

         __m128i final = _mm_packus_epi16(res, res);
         output[w] = _mm_cvtsi128_si32(final);
      }
   }
}
#else
// Vertical pass (plain C): filters the 64-bit-per-pixel intermediate frame
// (ctx->scaled) vertically and writes ARGB8888 pixels to output_.
//
// ctx:     prepared scaler context.
// output_: destination ARGB8888 frame.
// stride:  distance between destination rows in bytes.
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
{
   const uint64_t *input = ctx->scaled.frame;
   uint32_t *output = output_;

   const int16_t *filter_vert = ctx->vert.filter;

   for (int h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
   {
      // First intermediate row of the filter window for this output row.
      const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);

      for (int w = 0; w < ctx->out_width; w++)
      {
         int16_t res_a = 0;
         int16_t res_r = 0;
         int16_t res_g = 0;
         int16_t res_b = 0;

         const uint64_t *input_base_y = input_base + w;
         for (size_t y = 0; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
         {
            uint64_t col = *input_base_y;

            int16_t a = (col >> 48) & 0xffff;
            int16_t r = (col >> 32) & 0xffff;
            int16_t g = (col >> 16) & 0xffff;
            int16_t b = (col >> 0) & 0xffff;

            int16_t coeff = filter_vert[y];

            // Same operation as a SIMD mulhi: keep the high 16 bits of the product.
            res_a += (a * coeff) >> 16;
            res_r += (r * coeff) >> 16;
            res_g += (g * coeff) >> 16;
            res_b += (b * coeff) >> 16;
         }

         // Drop the remaining fractional bits.
         res_a >>= (7 - 2 - 2);
         res_r >>= (7 - 2 - 2);
         res_g >>= (7 - 2 - 2);
         res_b >>= (7 - 2 - 2);

         // Widen before shifting: a uint8_t promotes to int, and shifting
         // a value >= 0x80 left by 24 would overflow it.
         output[w] = ((uint32_t)clamp_8bit(res_a) << 24) |
            ((uint32_t)clamp_8bit(res_r) << 16) |
            ((uint32_t)clamp_8bit(res_g) << 8) |
            ((uint32_t)clamp_8bit(res_b) << 0);
      }
   }
}
#endif
#if defined(__SSE2__)
// Horizontal pass (SSE2): filters an ARGB8888 frame horizontally into the
// 64-bit-per-pixel intermediate frame (ctx->scaled).
//
// ctx:    prepared scaler context.
// input_: source ARGB8888 frame.
// stride: distance between source rows in bytes.
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
{
   const uint32_t *input = (const uint32_t*)input_;
   uint64_t *output = ctx->scaled.frame;

   for (int h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3)
   {
      const int16_t *filter_horiz = ctx->horiz.filter;

      for (int w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride)
      {
         __m128i res = _mm_setzero_si128();

         // First source pixel of the filter window for this output column.
         const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];

         size_t x;
         // Two taps per iteration: pixel x in the low 64 bits, pixel x + 1
         // in the high 64 bits.
         for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2)
         {
            // Broadcast each coefficient to the four 16-bit lanes of its
            // half. Done with set1 rather than an integer multiply, which
            // would turn a negative coefficient c into c - 1 in three lanes.
            __m128i coeff = _mm_unpacklo_epi64(_mm_set1_epi16(filter_horiz[x + 0]),
                  _mm_set1_epi16(filter_horiz[x + 1]));

            // Expand the 8-bit channels to 16 bits and shift them up to 15 bits.
            __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
                     ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());

            col = _mm_slli_epi16(col, 7);
            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         // Remaining tap for odd filter lengths; the high half of col is
         // zero, so it contributes nothing.
         for (; x < ctx->horiz.filter_len; x++)
         {
            __m128i coeff = _mm_set1_epi16(filter_horiz[x]);
            __m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());

            col = _mm_slli_epi16(col, 7);
            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         // Fold the two halves together and store the four 16-bit channels.
         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
         output[w] = _mm_cvtsi128_si64(res);
      }
   }
}
#else
// Horizontal pass (plain C): filters an ARGB8888 frame horizontally into
// the 64-bit-per-pixel intermediate frame (ctx->scaled).
//
// ctx:    prepared scaler context.
// input_: source ARGB8888 frame.
// stride: distance between source rows in bytes.
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
{
   const uint32_t *src_row = input_;
   uint64_t *dst_row = ctx->scaled.frame;

   for (int row = 0; row < ctx->scaled.height; row++)
   {
      const int16_t *taps = ctx->horiz.filter;

      for (int col = 0; col < ctx->scaled.width; col++)
      {
         // First source pixel of the filter window for this output column.
         const uint32_t *window = src_row + ctx->horiz.filter_pos[col];

         // Accumulators indexed by byte position in the pixel: b, g, r, a.
         int16_t acc[4] = { 0, 0, 0, 0 };

         for (size_t x = 0; x < ctx->horiz.filter_len; x++)
         {
            const uint32_t pix = window[x];
            const int16_t coeff = taps[x];

            for (int ch = 0; ch < 4; ch++)
            {
               // 8-bit channel moved up to occupy 15 bits.
               int16_t chan = ((pix >> (8 * ch)) & 0xff) << 7;
               // Same operation as a SIMD mulhi: keep the high 16 bits of the product.
               acc[ch] += (chan * coeff) >> 16;
            }
         }

         dst_row[col] = build_argb64(acc[3], acc[2], acc[1], acc[0]);
         taps += ctx->horiz.filter_stride;
      }

      src_row += stride >> 2;
      dst_row += ctx->scaled.stride >> 3;
   }
}
#endif

10
gfx/scaler/scaler_int.h Normal file
View File

@ -0,0 +1,10 @@
#ifndef SCALER_INT_H__
#define SCALER_INT_H__
#include "scaler.h"
// Vertical pass: filters ctx->scaled.frame into the ARGB8888 frame output.
// stride is the distance between output rows in bytes.
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output, int stride);
// Horizontal pass: filters the ARGB8888 frame input into ctx->scaled.frame.
// stride is the distance between input rows in bytes.
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input, int stride);
#endif

View File

@ -19,6 +19,7 @@
#include <string.h>
#include "../general.h"
#include "../input/rarch_sdl_input.h"
#include "scaler/scaler.h"
#include "gfx_common.h"
#include "gfx_context.h"
@ -53,6 +54,10 @@ typedef struct sdl_video
uint8_t font_g;
uint8_t font_b;
#endif
struct scaler_ctx scaler;
unsigned last_width;
unsigned last_height;
} sdl_video_t;
static void sdl_gfx_free(void *data)
@ -71,6 +76,8 @@ static void sdl_gfx_free(void *data)
font_renderer_free(vid->font);
#endif
scaler_ctx_gen_reset(&vid->scaler);
free(vid);
}
@ -268,23 +275,18 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
if (!video->fullscreen)
RARCH_LOG("Creating window @ %ux%u\n", video->width, video->height);
vid->render32 = video->rgb32 && !g_settings.video.force_16bit;
vid->render32 = !g_settings.video.force_16bit;
vid->screen = SDL_SetVideoMode(video->width, video->height, vid->render32 ? 32 : 15, SDL_HWSURFACE | SDL_HWACCEL | SDL_DOUBLEBUF | (video->fullscreen ? SDL_FULLSCREEN : 0));
if (!vid->screen && !g_settings.video.force_16bit && !video->rgb32)
{
vid->upsample = true;
vid->screen = SDL_SetVideoMode(video->width, video->height, 32, SDL_HWSURFACE | SDL_HWACCEL | SDL_DOUBLEBUF | (video->fullscreen ? SDL_FULLSCREEN : 0));
RARCH_WARN("SDL: 15-bit colors failed, attempting 32-bit colors.\n");
vid->render32 = true;
}
if (!vid->screen)
{
RARCH_ERR("Failed to init SDL surface: %s\n", SDL_GetError());
goto error;
}
if (!video->rgb32 && vid->render32)
vid->upsample = true;
SDL_ShowCursor(SDL_DISABLE);
#ifdef HAVE_X11
@ -358,6 +360,10 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
vid->convert_32_func = convert_32bit_32bit_shift;
}
vid->scaler.scaler_type = video->smooth ? SCALER_TYPE_BILINEAR : SCALER_TYPE_POINT;
vid->scaler.in_fmt = vid->render32 ? SCALER_FMT_ARGB8888 : SCALER_FMT_0RGB1555;
vid->scaler.out_fmt = vid->scaler.in_fmt;
return vid;
error:
@ -375,13 +381,20 @@ static inline uint16_t conv_pixel_32_15(uint32_t pix, const SDL_PixelFormat *fmt
static inline uint32_t conv_pixel_15_32(uint16_t pix, const SDL_PixelFormat *fmt)
{
uint32_t r = ((pix >> 10) & 0x1f) << (fmt->Rshift + 3);
uint32_t g = ((pix >> 5) & 0x1f) << (fmt->Gshift + 3);
uint32_t b = ((pix >> 0) & 0x1f) << (fmt->Bshift + 3);
return r | g | b;
uint32_t r = (pix >> 10) & 0x1f;
uint32_t g = (pix >> 5) & 0x1f;
uint32_t b = (pix >> 0) & 0x1f;
r = (r << 3) | (r >> 2);
g = (g << 3) | (g >> 2);
b = (b << 3) | (b >> 2);
return (r << fmt->Rshift) | (g << fmt->Gshift) | (b << fmt->Bshift);
}
static void convert_32bit_15bit(uint16_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
static void convert_32bit_15bit(uint16_t *out, unsigned outpitch,
const uint32_t *input, unsigned width, unsigned height,
unsigned pitch, const SDL_PixelFormat *fmt)
{
for (unsigned y = 0; y < height; y++)
{
@ -393,7 +406,9 @@ static void convert_32bit_15bit(uint16_t *out, unsigned outpitch, const uint32_t
}
}
static void convert_15bit_32bit(uint32_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
static void convert_15bit_32bit(uint32_t *out, unsigned outpitch,
const uint16_t *input, unsigned width, unsigned height,
unsigned pitch, const SDL_PixelFormat *fmt)
{
for (unsigned y = 0; y < height; y++)
{
@ -405,7 +420,9 @@ static void convert_15bit_32bit(uint32_t *out, unsigned outpitch, const uint16_t
}
}
static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch,
const uint16_t *input, unsigned width, unsigned height,
unsigned pitch, const SDL_PixelFormat *fmt)
{
for (unsigned y = 0; y < height; y++)
{
@ -416,7 +433,9 @@ static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch, const u
(void)fmt;
}
static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch,
const uint32_t *input, unsigned width, unsigned height,
unsigned pitch, const SDL_PixelFormat *fmt)
{
for (unsigned y = 0; y < height; y++)
{
@ -427,12 +446,15 @@ static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch, const u
(void)fmt;
}
static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch,
const uint16_t *input, unsigned width, unsigned height,
unsigned pitch, const SDL_PixelFormat *fmt)
{
for (unsigned y = 0; y < height; y++)
{
uint16_t *dest = out + ((y * outpitch) >> 1);
uint16_t *dest = out + ((y * outpitch) >> 1);
const uint16_t *src = input + ((y * pitch) >> 1);
for (unsigned x = 0; x < width; x++)
{
uint16_t color = src[x];
@ -444,12 +466,15 @@ static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch, const ui
}
}
static void convert_32bit_32bit_shift(uint32_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
static void convert_32bit_32bit_shift(uint32_t *out, unsigned outpitch,
const uint32_t *input, unsigned width, unsigned height,
unsigned pitch, const SDL_PixelFormat *fmt)
{
for (unsigned y = 0; y < height; y++)
{
uint32_t *dest = out + ((y * outpitch) >> 2);
uint32_t *dest = out + ((y * outpitch) >> 2);
const uint32_t *src = input + ((y * pitch) >> 2);
for (unsigned x = 0; x < width; x++)
{
uint32_t color = src[x];
@ -488,43 +513,51 @@ static bool sdl_gfx_frame(void *data, const void *frame, unsigned width, unsigne
if (SDL_MUSTLOCK(vid->buffer))
SDL_LockSurface(vid->buffer);
// :(
// 15-bit -> 32-bit (Sometimes 15-bit won't work on "modern" OSes :\)
// 15-bit -> 32-bit.
if (vid->upsample)
convert_15bit_32bit((uint32_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
// 15-bit -> 15-bit
else if (!vid->rgb32)
vid->convert_15_func((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
// 32-bit -> 15-bit
else if (vid->rgb32 && g_settings.video.force_16bit)
else if (vid->rgb32 && !vid->render32)
convert_32bit_15bit((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint32_t*)frame, width, height, pitch, vid->screen->format);
// 32-bit -> 32-bit
else
vid->convert_32_func((uint32_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint32_t*)frame, width, height, pitch, vid->screen->format);
if (width != vid->last_width || height != vid->last_height)
{
vid->scaler.in_width = width;
vid->scaler.in_height = height;
vid->scaler.in_stride = vid->buffer->pitch;
vid->scaler.out_width = vid->screen->w;
vid->scaler.out_height = vid->screen->h;
vid->scaler.out_stride = vid->screen->pitch;
scaler_ctx_gen_filter(&vid->scaler);
vid->last_width = width;
vid->last_height = height;
}
if (SDL_MUSTLOCK(vid->screen))
SDL_LockSurface(vid->screen);
scaler_ctx_scale(&vid->scaler, vid->screen->pixels, vid->buffer->pixels);
if (SDL_MUSTLOCK(vid->buffer))
SDL_UnlockSurface(vid->buffer);
SDL_Rect src = {0};
src.x = 0;
src.y = 0;
src.w = width;
src.h = height;
SDL_Rect dest = {0};
dest.x = 0;
dest.y = 0;
dest.w = vid->screen->w;
dest.h = vid->screen->h;
SDL_SoftStretch(vid->buffer, &src, vid->screen, &dest);
if (SDL_MUSTLOCK(vid->screen))
SDL_UnlockSurface(vid->screen);
if (msg)
{
if ((!vid->rgb32 || g_settings.video.force_16bit) && !vid->upsample)
sdl_render_msg_15(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
else
if (vid->render32)
sdl_render_msg_32(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
else
sdl_render_msg_15(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
}
char buf[128];