Add custom scaling routines.

Implemented for point, bilinear, lanczos. Partly optimized for SSE2.
2025-01-22 09:15:02 +00:00 · 2012-09-02 14:30:46 +02:00 · 2012-09-02 14:30:46 +02:00 · 19fa31f17d
commit 19fa31f17d
parent 22e43d4d84
12 changed files with 1176 additions and 41 deletions
--- a/1
+++ b/1
@ -126,6 +126,7 @@ endif

 ifeq ($(HAVE_SDL), 1)
   OBJ += gfx/sdl_gfx.o gfx/context/sdl_ctx.o input/sdl_input.o audio/sdl_audio.o fifo_buffer.o
+   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
   DEFINES += $(SDL_CFLAGS) $(BSD_LOCAL_INC)
   LIBS += $(SDL_LIBS)

--- a/Makefile.win
+++ b/Makefile.win
@ -62,6 +62,7 @@ endif

 ifeq ($(HAVE_SDL), 1)
   OBJ += gfx/sdl_gfx.o gfx/gl.o gfx/math/matrix.o gfx/fonts/freetype.o gfx/context/sdl_ctx.o input/sdl_input.o audio/sdl_audio.o fifo_buffer.o
+   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
   LIBS += -lSDL
   DEFINES += -ISDL -DHAVE_SDL
 endif
--- a/gfx/scaler/filter.c
+++ b/gfx/scaler/filter.c
@ -0,0 +1,250 @@
+#include "filter.h"
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+// Allocates the coefficient and source-position tables for both axes.
+// horiz/vert filter_stride must already be set; out_width/out_height give the
+// number of output samples per axis.
+// Returns false if any of the four allocations failed. Nothing is released
+// here on failure; the pointers stay in ctx.
+static bool allocate_filters(struct scaler_ctx *ctx)
+{
+   ctx->horiz.filter     = (int16_t*)scaler_alloc(sizeof(int16_t), ctx->horiz.filter_stride * ctx->out_width);
+   ctx->horiz.filter_pos = (int*)scaler_alloc(sizeof(int), ctx->out_width);
+
+   ctx->vert.filter      = (int16_t*)scaler_alloc(sizeof(int16_t), ctx->vert.filter_stride * ctx->out_height);
+   ctx->vert.filter_pos  = (int*)scaler_alloc(sizeof(int), ctx->out_height);
+
+   // The generators write to the position tables unconditionally, so they
+   // have to be checked as well as the coefficient tables.
+   return ctx->horiz.filter && ctx->horiz.filter_pos &&
+      ctx->vert.filter && ctx->vert.filter_pos;
+}
+
+// Fills a 1-tap filter for one axis: each of the len output samples reads the
+// single source sample at the integer part of its 16.16 fixed-point position,
+// with weight FILTER_UNITY. pos is the first position, step the increment.
+static void gen_filter_point_sub(struct scaler_filter *filter, int len, int pos, int step)
+{
+   int cursor = pos;
+   int out    = 0;
+
+   while (out < len)
+   {
+      filter->filter[out]     = FILTER_UNITY;
+      filter->filter_pos[out] = cursor >> 16;
+
+      cursor += step;
+      out++;
+   }
+}
+
+// Builds the point (nearest-sample) filter for both axes: one tap per output
+// sample. Positions are 16.16 fixed point; the start position is half a step
+// minus half a source sample.
+// Returns false if the filter tables could not be allocated.
+static bool gen_filter_point(struct scaler_ctx *ctx)
+{
+   ctx->horiz.filter_len = ctx->horiz.filter_stride = 1;
+   ctx->vert.filter_len  = ctx->vert.filter_stride  = 1;
+
+   if (!allocate_filters(ctx))
+      return false;
+
+   const int step_x  = (1 << 16) * ctx->in_width / ctx->out_width;
+   const int step_y  = (1 << 16) * ctx->in_height / ctx->out_height;
+   const int start_x = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
+   const int start_y = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
+
+   gen_filter_point_sub(&ctx->horiz, ctx->out_width, start_x, step_x);
+   gen_filter_point_sub(&ctx->vert, ctx->out_height, start_y, step_y);
+
+   return true;
+}
+
+// Fills a 2-tap linear filter for one axis. The integer part of the 16.16
+// position selects the first source sample; the 16-bit fraction, reduced to
+// 14 bits to match FILTER_UNITY, is the weight of the second sample and the
+// first sample receives the remainder.
+static void gen_filter_bilinear_sub(struct scaler_filter *filter, int len, int pos, int step)
+{
+   for (int out = 0; out < len; out++)
+   {
+      int16_t *taps      = filter->filter + out * 2;
+      const int16_t frac = (pos & 0xffff) >> 2;
+
+      filter->filter_pos[out] = pos >> 16;
+      taps[1]                 = frac;
+      taps[0]                 = FILTER_UNITY - frac;
+
+      pos += step;
+   }
+}
+
+// Builds the bilinear filter for both axes: two taps per output sample.
+// Positions are 16.16 fixed point; the start position is half a step minus
+// half a source sample.
+// Returns false if the filter tables could not be allocated.
+static bool gen_filter_bilinear(struct scaler_ctx *ctx)
+{
+   ctx->horiz.filter_len = ctx->horiz.filter_stride = 2;
+   ctx->vert.filter_len  = ctx->vert.filter_stride  = 2;
+
+   if (!allocate_filters(ctx))
+      return false;
+
+   const int step_x  = (1 << 16) * ctx->in_width / ctx->out_width;
+   const int step_y  = (1 << 16) * ctx->in_height / ctx->out_height;
+   const int start_x = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
+   const int start_y = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
+
+   gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, start_x, step_x);
+   gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, start_y, step_y);
+
+   return true;
+}
+
+// Unnormalised sinc: sin(phase) / phase, with the value 1.0 substituted when
+// |phase| is below 0.0001 to avoid dividing by (nearly) zero.
+static inline double sinc(double phase)
+{
+   const bool near_zero = fabs(phase) < 0.0001;
+   return near_zero ? 1.0 : sin(phase) / phase;
+}
+
+// Rounds v up to the next power of two (values that already are a power of
+// two are returned unchanged; 0 yields 0). The bit smearing covers 32 bits.
+static inline unsigned next_pow2(unsigned v)
+{
+   unsigned bits = v - 1;
+
+   // Copy the highest set bit into every lower position.
+   for (unsigned shift = 1; shift <= 16; shift <<= 1)
+      bits |= bits >> shift;
+
+   return bits + 1;
+}
+
+// Fills a windowed-sinc filter for one axis with filter->filter_len taps per
+// output sample. The phase of tap j is measured from the middle of the tap
+// window, offset by the 16-bit fractional part of the position. Each
+// coefficient is sinc(phase * phase_mul), windowed by sinc(phase / (taps / 2))
+// and scaled by phase_mul, quantised so that 1.0 maps to FILTER_UNITY.
+static void gen_filter_sinc_sub(struct scaler_filter *filter, int len, int pos, int step, double phase_mul)
+{
+   const int taps      = filter->filter_len;
+   const int half_taps = taps >> 1;
+   int16_t *coeffs     = filter->filter;
+
+   for (int out = 0; out < len; out++, pos += step, coeffs += taps)
+   {
+      filter->filter_pos[out] = pos >> 16;
+
+      for (int tap = 0; tap < taps; tap++)
+      {
+         const double phase  = M_PI * ((double)((taps << 15) + (pos & 0xffff)) / 0x10000 - tap);
+         const double window = phase / half_taps;
+
+         coeffs[tap] = FILTER_UNITY * sinc(phase * phase_mul) * sinc(window) * phase_mul;
+      }
+   }
+}
+
+// Tap count for one axis of the windowed-sinc filter: 8 taps when not
+// downscaling, otherwise 8 times the integer downscale ratio rounded up to a
+// power of two. The kernel has to be widened when downsampling to get a
+// proper low-pass effect.
+static int sinc_filter_taps(int in_len, int out_len)
+{
+   return 8 * (in_len > out_len ? (int)next_pow2(in_len / out_len) : 1);
+}
+
+// Builds the windowed-sinc filter for both axes.
+// The tap count is chosen per axis. Reusing the horizontal count vertically
+// left a vertical-only downscale with a truncated kernel, and could make the
+// vertical kernel longer than the image when only the width was reduced.
+// Returns false if the filter tables could not be allocated.
+static bool gen_filter_sinc(struct scaler_ctx *ctx)
+{
+   const int horiz_taps = sinc_filter_taps(ctx->in_width, ctx->out_width);
+   const int vert_taps  = sinc_filter_taps(ctx->in_height, ctx->out_height);
+
+   ctx->horiz.filter_len    = horiz_taps;
+   ctx->horiz.filter_stride = horiz_taps;
+   ctx->vert.filter_len     = vert_taps;
+   ctx->vert.filter_stride  = vert_taps;
+
+   if (!allocate_filters(ctx))
+      return false;
+
+   // 16.16 fixed point. The start position is moved back by half the tap
+   // window so the window is centred on the sample position.
+   int x_pos  = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15) - (horiz_taps << 15);
+   int x_step = (1 << 16) * ctx->in_width / ctx->out_width;
+   int y_pos  = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15) - (vert_taps << 15);
+   int y_step = (1 << 16) * ctx->in_height / ctx->out_height;
+
+   // When downscaling, stretch the kernel by the scale ratio.
+   double phase_mul_horiz = ctx->in_width  > ctx->out_width  ? (double)ctx->out_width  / ctx->in_width  : 1.0;
+   double phase_mul_vert  = ctx->in_height > ctx->out_height ? (double)ctx->out_height / ctx->in_height : 1.0;
+
+   gen_filter_sinc_sub(&ctx->horiz, ctx->out_width, x_pos, x_step, phase_mul_horiz);
+   gen_filter_sinc_sub(&ctx->vert, ctx->out_height, y_pos, y_step, phase_mul_vert);
+
+   return true;
+}
+
+
+// Checks that every tap window starts inside [0, in_len - filter_len] on both
+// axes, i.e. that no tap reads outside the source image. The first offending
+// sample is reported on stderr and false is returned.
+static bool validate_filter(struct scaler_ctx *ctx)
+{
+   const int *pos_x = ctx->horiz.filter_pos;
+   const int last_x = ctx->in_width - ctx->horiz.filter_len;
+
+   for (int x = 0; x < ctx->out_width; x++)
+   {
+      if (pos_x[x] >= 0 && pos_x[x] <= last_x)
+         continue;
+
+      fprintf(stderr, "Out X = %d => In X = %d\n", x, pos_x[x]);
+      return false;
+   }
+
+   const int *pos_y = ctx->vert.filter_pos;
+   const int last_y = ctx->in_height - ctx->vert.filter_len;
+
+   for (int y = 0; y < ctx->out_height; y++)
+   {
+      if (pos_y[y] >= 0 && pos_y[y] <= last_y)
+         continue;
+
+      fprintf(stderr, "Out Y = %d => In Y = %d\n", y, pos_y[y]);
+      return false;
+   }
+
+   return true;
+}
+
+// Moves each tap window of one axis into [0, in_len - filter_len].
+// When a window is moved, its coefficients are shifted the opposite way
+// inside the window so every surviving coefficient still multiplies the same
+// source sample. Coefficients that pointed outside the image are dropped and
+// the vacated slots are zeroed; a window entirely outside becomes all zero.
+static void fixup_filter_sub(struct scaler_filter *filter, int out_len, int in_len)
+{
+   const int taps     = (int)filter->filter_len;
+   const int last_pos = in_len - filter->filter_len;
+
+   for (int out = 0; out < out_len; out++)
+   {
+      int16_t *coeffs      = filter->filter + out * filter->filter_stride;
+      const int start      = filter->filter_pos[out];
+      const int overshoot  = start - last_pos; // Taps past the last source sample.
+      const int undershoot = -start;           // Taps before the first source sample.
+
+      if (overshoot > 0)
+      {
+         filter->filter_pos[out] -= overshoot;
+
+         if (overshoot > taps)
+            memset(coeffs, 0, filter->filter_len * sizeof(int16_t));
+         else
+         {
+            // Window moved left, so coefficients move right.
+            memmove(coeffs + overshoot, coeffs, (filter->filter_len - overshoot) * sizeof(int16_t));
+            memset(coeffs, 0, overshoot * sizeof(int16_t));
+         }
+      }
+
+      if (undershoot > 0)
+      {
+         filter->filter_pos[out] += undershoot;
+
+         if (undershoot > taps)
+            memset(coeffs, 0, filter->filter_len * sizeof(int16_t));
+         else
+         {
+            // Window moved right, so coefficients move left.
+            memmove(coeffs, coeffs + undershoot, (filter->filter_len - undershoot) * sizeof(int16_t));
+            memset(coeffs + (filter->filter_len - undershoot), 0, undershoot * sizeof(int16_t));
+         }
+      }
+   }
+}
+
+// Clamps the tap windows of both axes so no tap reads outside the source
+// rectangle. The two axes are independent of each other.
+static void fixup_filter(struct scaler_ctx *ctx)
+{
+   struct scaler_filter *horiz = &ctx->horiz;
+   struct scaler_filter *vert  = &ctx->vert;
+
+   fixup_filter_sub(horiz, ctx->out_width, ctx->in_width);
+   fixup_filter_sub(vert, ctx->out_height, ctx->in_height);
+}
+
+
+// Generates the horizontal and vertical filter tables for ctx->scaler_type,
+// clamps them to the source rectangle and validates the result.
+// Returns false for an unknown scaler type, for non-positive dimensions, on
+// allocation failure, or when the clamped filter still reads out of bounds.
+bool scaler_gen_filter(struct scaler_ctx *ctx)
+{
+   // The generators divide by the output size and size their tables from
+   // it, so empty or negative dimensions must be rejected up front.
+   if (ctx->in_width <= 0 || ctx->in_height <= 0 ||
+         ctx->out_width <= 0 || ctx->out_height <= 0)
+      return false;
+
+   bool ret = true;
+
+   switch (ctx->scaler_type)
+   {
+      case SCALER_TYPE_POINT:
+         ret = gen_filter_point(ctx);
+         break;
+
+      case SCALER_TYPE_BILINEAR:
+         ret = gen_filter_bilinear(ctx);
+         break;
+
+      case SCALER_TYPE_SINC:
+         ret = gen_filter_sinc(ctx);
+         break;
+
+      default:
+         return false;
+   }
+
+   if (!ret)
+      return false;
+
+   fixup_filter(ctx);
+
+   return validate_filter(ctx);
+}
+
--- a/gfx/scaler/filter.h
+++ b/gfx/scaler/filter.h
@ -0,0 +1,10 @@
+#ifndef FILTER_H__
+#define FILTER_H__
+
+#include <stdbool.h>
+#include "scaler.h"
+
+// Builds the horizontal and vertical filter tables in ctx for
+// ctx->scaler_type. Returns false on failure.
+bool scaler_gen_filter(struct scaler_ctx *ctx);
+
+#endif
+
--- a/gfx/scaler/main.c
+++ b/gfx/scaler/main.c
@ -0,0 +1,171 @@
+#include "scaler.h"
+#include <Imlib2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <getopt.h>
+#include <string.h>
+
+// Command-line state, filled in by parse_args().
+// Scale factors relative to the input size (1.0 keeps the size).
+static float g_horiz_scale = 1.0f;
+static float g_vert_scale  = 1.0f;
+
+// Filter used when no -t/--type option is given.
+static enum scaler_type g_scaler_type  = SCALER_TYPE_SINC;
+
+// Heap copies (strdup) of the -i and -o arguments; NULL until parsed.
+static char *g_in_path;
+static char *g_out_path;
+
+// Writes the usage text to stderr.
+static void print_help(void)
+{
+   static const char *const usage[] = {
+      "Usage: scale [...options...]\n",
+      "\t-i/--input: Input file\n",
+      "\t-o/--output: Output file\n",
+      "\t-x/--xscale: Relative scale in X\n",
+      "\t-y/--yscale: Relative scale in Y\n",
+      "\t-s/--scale: Relative scale in both X/Y\n",
+      "\t-t/--type: Filter type. Valid ones are:\n",
+      "\t\tsinc, point, bilinear\n",
+      "\t-h/--help: Prints this help\n",
+   };
+
+   for (size_t line = 0; line < sizeof(usage) / sizeof(usage[0]); line++)
+      fputs(usage[line], stderr);
+}
+
+// Parses a scale factor. Accepts only a complete, finite, strictly positive
+// number. On success stores it in *out and returns true; otherwise leaves
+// *out untouched and returns false.
+static bool parse_scale(const char *arg, float *out)
+{
+   char *end   = NULL;
+   float value = strtof(arg, &end);
+
+   // No digits consumed, or trailing garbage.
+   if (end == arg || *end != '\0')
+      return false;
+
+   // Rejects zero, negatives and NaN (every comparison with NaN is false).
+   if (!(value > 0.0f))
+      return false;
+
+   // Rejects infinity, which strtof() returns on overflow: inf - inf is NaN.
+   if (value - value != 0.0f)
+      return false;
+
+   *out = value;
+   return true;
+}
+
+// Parses the command line into the g_* globals.
+// Returns false after printing the usage text when an option is unknown or
+// malformed, when the input or output path is missing, or when stray
+// positional arguments remain. -h/--help prints the usage and exits.
+static bool parse_args(int argc, char *argv[])
+{
+   const struct option opts[] = {
+      { "xscale", required_argument, NULL, 'x' },
+      { "yscale", required_argument, NULL, 'y' },
+      { "scale", required_argument, NULL, 's' },
+      { "input", required_argument, NULL, 'i' },
+      { "output", required_argument, NULL, 'o' },
+      { "type", required_argument, NULL, 't' },
+      { "help", no_argument, NULL, 'h' },
+      { NULL, 0, NULL, 0 },
+   };
+
+   const char *optstring = "x:y:i:o:t:s:h";
+
+   for (;;)
+   {
+      int c = getopt_long(argc, argv, optstring, opts, NULL);
+      if (c == -1)
+         break;
+
+      switch (c)
+      {
+         case 'h':
+            print_help();
+            exit(EXIT_SUCCESS);
+
+         case 's':
+            if (!parse_scale(optarg, &g_horiz_scale))
+            {
+               print_help();
+               return false;
+            }
+            g_vert_scale = g_horiz_scale;
+            break;
+
+         case 'x':
+            if (!parse_scale(optarg, &g_horiz_scale))
+            {
+               print_help();
+               return false;
+            }
+            break;
+
+         case 'y':
+            if (!parse_scale(optarg, &g_vert_scale))
+            {
+               print_help();
+               return false;
+            }
+            break;
+
+         case 'i':
+            // The option may be repeated; drop the earlier copy.
+            free(g_in_path);
+            g_in_path = strdup(optarg);
+            break;
+
+         case 'o':
+            free(g_out_path);
+            g_out_path = strdup(optarg);
+            break;
+
+         case 't':
+            if (strcmp(optarg, "sinc") == 0)
+               g_scaler_type = SCALER_TYPE_SINC;
+            else if (strcmp(optarg, "bilinear") == 0)
+               g_scaler_type = SCALER_TYPE_BILINEAR;
+            else if (strcmp(optarg, "point") == 0)
+               g_scaler_type = SCALER_TYPE_POINT;
+            else
+            {
+               print_help();
+               return false;
+            }
+            break;
+
+         case '?':
+         default:
+            print_help();
+            return false;
+      }
+   }
+
+   // Also catches a failed strdup().
+   if (!g_in_path || !g_out_path)
+   {
+      print_help();
+      return false;
+   }
+
+   if (optind < argc)
+   {
+      print_help();
+      return false;
+   }
+
+   return true;
+}
+
+// Test driver: loads an image with Imlib2, scales it by the requested
+// factors with the selected filter, and saves the result. The output format
+// is taken from the output path's extension ("png" if there is none).
+// Returns EXIT_SUCCESS on success and EXIT_FAILURE on any error.
+int main(int argc, char *argv[])
+{
+   if (!parse_args(argc, argv))
+      return EXIT_FAILURE;
+
+   int ret               = EXIT_FAILURE;
+   uint32_t *scale_buf   = NULL;
+   Imlib_Image new_img   = NULL;
+   const char *fmt       = NULL;
+   struct scaler_ctx ctx = {0};
+
+   Imlib_Image img = imlib_load_image(g_in_path);
+   if (!img)
+   {
+      fprintf(stderr, "Failed to load image: %s\n", g_in_path);
+      goto end;
+   }
+
+   imlib_context_set_image(img);
+
+   ctx.in_width    = imlib_image_get_width();
+   ctx.in_height   = imlib_image_get_height();
+   ctx.out_width   = (int)(imlib_image_get_width() * g_horiz_scale);
+   ctx.out_height  = (int)(imlib_image_get_height() * g_vert_scale);
+   ctx.in_stride   = ctx.in_width * sizeof(uint32_t);
+   ctx.out_stride  = ctx.out_width * sizeof(uint32_t);
+   ctx.in_fmt      = SCALER_FMT_ARGB8888;
+   ctx.out_fmt     = SCALER_FMT_ARGB8888;
+   ctx.scaler_type = g_scaler_type;
+
+   // A small scale factor can truncate the output size to zero.
+   if (ctx.out_width <= 0 || ctx.out_height <= 0)
+   {
+      fprintf(stderr, "Scaled image would be empty (%d x %d)\n", ctx.out_width, ctx.out_height);
+      imlib_free_image();
+      goto end;
+   }
+
+   // Must not live inside assert(): with NDEBUG the call would be compiled
+   // out and the scaler would run with no filter.
+   if (!scaler_ctx_gen_filter(&ctx))
+   {
+      fprintf(stderr, "Failed to generate scaler filter\n");
+      imlib_free_image();
+      goto end;
+   }
+
+   scale_buf = (uint32_t*)calloc((size_t)ctx.out_width * ctx.out_height, sizeof(uint32_t));
+   if (!scale_buf)
+   {
+      fprintf(stderr, "Failed to allocate output buffer\n");
+      imlib_free_image();
+      goto end;
+   }
+
+   scaler_ctx_scale(&ctx, scale_buf, imlib_image_get_data_for_reading_only());
+
+   new_img = imlib_create_image_using_data(ctx.out_width, ctx.out_height,
+         scale_buf);
+
+   // Releases the source image, which is still the current context image.
+   imlib_free_image();
+
+   if (!new_img)
+   {
+      fprintf(stderr, "Failed to create output image\n");
+      goto end;
+   }
+
+   imlib_context_set_image(new_img);
+
+   fmt = strrchr(g_out_path, '.');
+   if (fmt)
+      fmt++;
+   else
+      fmt = "png";
+
+   imlib_image_set_format(fmt);
+   imlib_save_image(g_out_path);
+   imlib_free_image();
+
+   ret = EXIT_SUCCESS;
+
+end:
+   free(scale_buf);
+   free(g_in_path);
+   free(g_out_path);
+
+   scaler_ctx_gen_reset(&ctx);
+   return ret;
+}
+
--- a/gfx/scaler/pixconv.c
+++ b/gfx/scaler/pixconv.c
@ -0,0 +1,131 @@
+#include "pixconv.h"
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Converts a 0RGB1555 image (16 bits per pixel, red in bits 10-14, green in
+// bits 5-9, blue in bits 0-4) to ARGB8888 with alpha forced to 0xff.
+// Each 5-bit channel is widened to 8 bits by replicating its top bits.
+// Strides are in bytes; width and height are in pixels.
+void conv_0rgb1555_argb8888(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint16_t *input = (const uint16_t*)input_;
+   uint32_t *output      = (uint32_t*)output_;
+
+   for (int h = 0; h < height; h++, output += out_stride >> 2, input += in_stride >> 1)
+   {
+      for (int w = 0; w < width; w++)
+      {
+         uint32_t col = input[w];
+         uint32_t r = (col >> 10) & 0x1f;
+         uint32_t g = (col >>  5) & 0x1f;
+         uint32_t b = (col >>  0) & 0x1f;
+         r = (r << 3) | (r >> 2);
+         g = (g << 3) | (g >> 2);
+         b = (b << 3) | (b >> 2);
+
+         // Unsigned constant: 0xff << 24 would overflow a signed int.
+         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
+      }
+   }
+}
+
+// Converts a 0RGB1555 image to packed 24-bit pixels stored as the bytes
+// blue, green, red. Each 5-bit channel is widened to 8 bits by replicating
+// its top bits. Strides are in bytes; width and height are in pixels.
+void conv_0rgb1555_bgr24(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint16_t *src_row = (const uint16_t*)input_;
+   uint8_t *dst_row        = (uint8_t*)output_;
+   const int src_pitch     = in_stride >> 1;
+
+   for (int y = 0; y < height; y++)
+   {
+      uint8_t *dst = dst_row;
+
+      for (int x = 0; x < width; x++, dst += 3)
+      {
+         const uint32_t pix = src_row[x];
+         const uint32_t r5  = (pix >> 10) & 0x1f;
+         const uint32_t g5  = (pix >>  5) & 0x1f;
+         const uint32_t b5  = (pix >>  0) & 0x1f;
+
+         dst[0] = (b5 << 3) | (b5 >> 2);
+         dst[1] = (g5 << 3) | (g5 >> 2);
+         dst[2] = (r5 << 3) | (r5 >> 2);
+      }
+
+      dst_row += out_stride;
+      src_row += src_pitch;
+   }
+}
+
+// Converts packed 24-bit pixels stored as the bytes blue, green, red to
+// ARGB8888 with alpha forced to 0xff.
+// Strides are in bytes; width and height are in pixels.
+void conv_bgr24_argb8888(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint8_t *input = (const uint8_t*)input_;
+   uint32_t *output     = (uint32_t*)output_;
+
+   for (int h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
+   {
+      const uint8_t *inp = input;
+      for (int w = 0; w < width; w++)
+      {
+         uint32_t b = *inp++;
+         uint32_t g = *inp++;
+         uint32_t r = *inp++;
+         // Unsigned constant: 0xff << 24 would overflow a signed int.
+         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
+      }
+   }
+}
+
+// Converts ARGB8888 to 0RGB1555 by keeping the top five bits of the red,
+// green and blue channels; alpha is discarded.
+// Strides are in bytes; width and height are in pixels.
+void conv_argb8888_0rgb1555(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint32_t *src_row = (const uint32_t*)input_;
+   uint16_t *dst_row       = (uint16_t*)output_;
+   const int src_pitch     = in_stride >> 2;
+   const int dst_pitch     = out_stride >> 1;
+
+   for (int y = 0; y < height; y++)
+   {
+      for (int x = 0; x < width; x++)
+      {
+         const uint32_t pix = src_row[x];
+         const uint16_t r5  = (pix >> 19) & 0x1f;
+         const uint16_t g5  = (pix >> 11) & 0x1f;
+         const uint16_t b5  = (pix >>  3) & 0x1f;
+
+         dst_row[x] = (r5 << 10) | (g5 << 5) | (b5 << 0);
+      }
+
+      dst_row += dst_pitch;
+      src_row += src_pitch;
+   }
+}
+
+// Converts ARGB8888 to packed 24-bit pixels stored as the bytes blue, green,
+// red; alpha is discarded.
+// Strides are in bytes; width and height are in pixels.
+void conv_argb8888_bgr24(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint32_t *src_row = (const uint32_t*)input_;
+   uint8_t *dst_row        = (uint8_t*)output_;
+   const int src_pitch     = in_stride >> 2;
+
+   for (int y = 0; y < height; y++)
+   {
+      uint8_t *dst = dst_row;
+
+      for (int x = 0; x < width; x++, dst += 3)
+      {
+         const uint32_t pix = src_row[x];
+
+         dst[0] = (uint8_t)(pix >>  0);
+         dst[1] = (uint8_t)(pix >>  8);
+         dst[2] = (uint8_t)(pix >> 16);
+      }
+
+      dst_row += out_stride;
+      src_row += src_pitch;
+   }
+}
+
+// Copies an image row by row without converting pixels. Each row copies
+// min(|out_stride|, |in_stride|) bytes; width is not used.
+// Strides are in bytes and advance the row pointers as given.
+void conv_copy(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   (void)width;
+
+   const int out_abs = abs(out_stride);
+   const int in_abs  = abs(in_stride);
+   const int row_len = in_abs < out_abs ? in_abs : out_abs;
+
+   const uint8_t *src = (const uint8_t*)input_;
+   uint8_t *dst       = (uint8_t*)output_;
+
+   for (int y = 0; y < height; y++)
+   {
+      memcpy(dst, src, row_len);
+      dst += out_stride;
+      src += in_stride;
+   }
+}
+
--- a/gfx/scaler/pixconv.h
+++ b/gfx/scaler/pixconv.h
@ -0,0 +1,29 @@
+#ifndef PIXCONV_H__
+#define PIXCONV_H__
+
+// Pixel format converters. All share one signature: output and input point
+// at the first row, width and height are in pixels, and the strides are the
+// distance between rows in bytes.
+
+// 0RGB1555 -> ARGB8888 (alpha set to 0xff).
+void conv_0rgb1555_argb8888(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride);
+
+// BGR24 (bytes B, G, R) -> ARGB8888 (alpha set to 0xff).
+void conv_bgr24_argb8888(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride);
+
+// ARGB8888 -> 0RGB1555 (alpha dropped, channels truncated to 5 bits).
+void conv_argb8888_0rgb1555(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride);
+
+// ARGB8888 -> BGR24 (bytes B, G, R; alpha dropped).
+void conv_argb8888_bgr24(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride);
+
+// 0RGB1555 -> BGR24 (bytes B, G, R).
+void conv_0rgb1555_bgr24(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride);
+
+// Row-wise copy with no conversion; copies the smaller of the two absolute
+// strides per row and ignores width.
+void conv_copy(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride);
+
+#endif
+
--- a/gfx/scaler/scaler.c
+++ b/gfx/scaler/scaler.c
@ -0,0 +1,195 @@
+#include "scaler.h"
+#include "scaler_int.h"
+#include "filter.h"
+#include "pixconv.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+
+// Allocation hook for all scaler buffers. Returns elem_size * size bytes of
+// zero-initialised memory, or NULL on failure. Kept as a wrapper in case
+// aligned allocations are needed later.
+void *scaler_alloc(size_t elem_size, size_t size)
+{
+   void *mem = calloc(elem_size, size);
+   return mem;
+}
+
+// Releases memory obtained from scaler_alloc(). NULL is accepted.
+void scaler_free(void *ptr)
+{
+   free(ptr);
+}
+
+// Allocates the intermediate frames used while scaling:
+//  - scaled: output of the horizontal pass, out_width x in_height pixels of
+//    64 bits each;
+//  - input / output: ARGB8888 staging frames, only when the respective
+//    external format is not already ARGB8888.
+// Row lengths are padded to a multiple of 8 pixels; strides are in bytes.
+// Returns false as soon as one allocation fails.
+static bool allocate_frames(struct scaler_ctx *ctx)
+{
+   const int padded_out_w = (ctx->out_width + 7) & ~7;
+
+   ctx->scaled.width  = ctx->out_width;
+   ctx->scaled.height = ctx->in_height;
+   ctx->scaled.stride = padded_out_w * sizeof(uint64_t);
+   ctx->scaled.frame  = (uint64_t*)scaler_alloc(sizeof(uint64_t), (ctx->scaled.stride * ctx->scaled.height) >> 3);
+   if (!ctx->scaled.frame)
+      return false;
+
+   const bool stage_input = ctx->in_fmt != SCALER_FMT_ARGB8888;
+   if (stage_input)
+   {
+      const int padded_in_w = (ctx->in_width + 7) & ~7;
+
+      ctx->input.stride = padded_in_w * sizeof(uint32_t);
+      ctx->input.frame  = (uint32_t*)scaler_alloc(sizeof(uint32_t), (ctx->input.stride * ctx->in_height) >> 2);
+      if (!ctx->input.frame)
+         return false;
+   }
+
+   const bool stage_output = ctx->out_fmt != SCALER_FMT_ARGB8888;
+   if (stage_output)
+   {
+      ctx->output.stride = padded_out_w * sizeof(uint32_t);
+      ctx->output.frame  = (uint32_t*)scaler_alloc(sizeof(uint32_t), (ctx->output.stride * ctx->out_height) >> 2);
+      if (!ctx->output.frame)
+         return false;
+   }
+
+   return true;
+}
+
+// Selects the converter used when no scaling is needed (input and output
+// sizes match). Identical formats use a plain copy. Returns false, leaving
+// ctx->direct_pixconv untouched, when the format pair is not supported.
+static bool set_direct_pix_conv(struct scaler_ctx *ctx)
+{
+   static const struct
+   {
+      enum scaler_pix_fmt in;
+      enum scaler_pix_fmt out;
+      void (*conv)(void*, const void*, int, int, int, int);
+   } direct[] = {
+      { SCALER_FMT_0RGB1555, SCALER_FMT_ARGB8888, conv_0rgb1555_argb8888 },
+      { SCALER_FMT_BGR24,    SCALER_FMT_ARGB8888, conv_bgr24_argb8888 },
+      { SCALER_FMT_ARGB8888, SCALER_FMT_0RGB1555, conv_argb8888_0rgb1555 },
+      { SCALER_FMT_ARGB8888, SCALER_FMT_BGR24,    conv_argb8888_bgr24 },
+      { SCALER_FMT_0RGB1555, SCALER_FMT_BGR24,    conv_0rgb1555_bgr24 },
+   };
+
+   if (ctx->in_fmt == ctx->out_fmt)
+   {
+      ctx->direct_pixconv = conv_copy;
+      return true;
+   }
+
+   for (size_t i = 0; i < sizeof(direct) / sizeof(direct[0]); i++)
+   {
+      if (direct[i].in == ctx->in_fmt && direct[i].out == ctx->out_fmt)
+      {
+         ctx->direct_pixconv = direct[i].conv;
+         return true;
+      }
+   }
+
+   return false;
+}
+
+// Selects the converters that bring the input to ARGB8888 before scaling
+// and the result from ARGB8888 to the output format afterwards. ARGB8888 on
+// either side needs no converter and leaves that pointer untouched.
+// Returns false for a format that is not recognised.
+static bool set_pix_conv(struct scaler_ctx *ctx)
+{
+   if (ctx->in_fmt == SCALER_FMT_0RGB1555)
+      ctx->in_pixconv = conv_0rgb1555_argb8888;
+   else if (ctx->in_fmt == SCALER_FMT_BGR24)
+      ctx->in_pixconv = conv_bgr24_argb8888;
+   else if (ctx->in_fmt != SCALER_FMT_ARGB8888)
+      return false;
+
+   if (ctx->out_fmt == SCALER_FMT_0RGB1555)
+      ctx->out_pixconv = conv_argb8888_0rgb1555;
+   else if (ctx->out_fmt == SCALER_FMT_BGR24)
+      ctx->out_pixconv = conv_argb8888_bgr24;
+   else if (ctx->out_fmt != SCALER_FMT_ARGB8888)
+      return false;
+
+   return true;
+}
+
+// Prepares ctx for scaling from the user-set fields (sizes, strides,
+// formats, scaler type). Any previous state is released first. When input
+// and output sizes are equal only a pixel format conversion is set up;
+// otherwise the scaler passes, converters and filter tables are created.
+// Returns false on allocation failure or an unsupported configuration.
+bool scaler_ctx_gen_filter(struct scaler_ctx *ctx)
+{
+   scaler_ctx_gen_reset(ctx);
+
+   const bool same_size = ctx->in_width == ctx->out_width &&
+      ctx->in_height == ctx->out_height;
+
+   ctx->unscaled = same_size;
+   if (!same_size)
+   {
+      ctx->scaler_horiz = scaler_argb8888_horiz;
+      ctx->scaler_vert  = scaler_argb8888_vert;
+   }
+
+   if (!allocate_frames(ctx))
+      return false;
+
+   // Only pixel format conversion ...
+   if (ctx->unscaled)
+      return set_direct_pix_conv(ctx);
+
+   return set_pix_conv(ctx) && scaler_gen_filter(ctx);
+}
+
+// Frees everything scaler_ctx_gen_filter() allocated and zeroes the
+// generated state (filters and intermediate frames). The user-set fields
+// (sizes, strides, formats, scaler type) are left alone. Safe to call on a
+// zero-initialised context.
+void scaler_ctx_gen_reset(struct scaler_ctx *ctx)
+{
+   scaler_free(ctx->horiz.filter);
+   scaler_free(ctx->horiz.filter_pos);
+   memset(&ctx->horiz, 0, sizeof(ctx->horiz));
+
+   scaler_free(ctx->vert.filter);
+   scaler_free(ctx->vert.filter_pos);
+   memset(&ctx->vert, 0, sizeof(ctx->vert));
+
+   scaler_free(ctx->scaled.frame);
+   memset(&ctx->scaled, 0, sizeof(ctx->scaled));
+
+   scaler_free(ctx->input.frame);
+   memset(&ctx->input, 0, sizeof(ctx->input));
+
+   scaler_free(ctx->output.frame);
+   memset(&ctx->output, 0, sizeof(ctx->output));
+}
+
+// Runs one frame through a context prepared by scaler_ctx_gen_filter().
+// Unscaled contexts only convert the pixel format. Otherwise the input is
+// converted to ARGB8888 if needed, scaled horizontally into ctx->scaled,
+// scaled vertically, and converted to the output format if needed.
+void scaler_ctx_scale(const struct scaler_ctx *ctx,
+      void *output, const void *input)
+{
+   if (ctx->unscaled)
+   {
+      ctx->direct_pixconv(output, input,
+            ctx->out_width, ctx->out_height,
+            ctx->out_stride, ctx->in_stride);
+      return;
+   }
+
+   // Horizontal pass, reading either the caller's buffer or the staged copy.
+   const void *horiz_src = input;
+   int horiz_stride      = ctx->in_stride;
+
+   if (ctx->in_fmt != SCALER_FMT_ARGB8888)
+   {
+      ctx->in_pixconv(ctx->input.frame, input,
+            ctx->in_width, ctx->in_height,
+            ctx->input.stride, ctx->in_stride);
+
+      horiz_src    = ctx->input.frame;
+      horiz_stride = ctx->input.stride;
+   }
+
+   ctx->scaler_horiz(ctx, horiz_src, horiz_stride);
+
+   // Vertical pass, straight into the caller's buffer when it is ARGB8888.
+   if (ctx->out_fmt == SCALER_FMT_ARGB8888)
+   {
+      ctx->scaler_vert(ctx, output, ctx->out_stride);
+      return;
+   }
+
+   ctx->scaler_vert(ctx, ctx->output.frame, ctx->output.stride);
+
+   ctx->out_pixconv(output, ctx->output.frame,
+         ctx->out_width, ctx->out_height,
+         ctx->out_stride, ctx->output.stride);
+}
+
+
--- a/gfx/scaler/scaler.h
+++ b/gfx/scaler/scaler.h
@ -0,0 +1,90 @@
+#ifndef SCALER_H__
+#define SCALER_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+// Fixed-point representation of a filter coefficient of 1.0.
+#define FILTER_UNITY (1 << 14)
+
+// Pixel formats accepted as scaler input and output.
+enum scaler_pix_fmt
+{
+   SCALER_FMT_ARGB8888 = 0, // 32 bits per pixel; the scaler's internal format.
+   SCALER_FMT_0RGB1555,     // 16 bits per pixel, 5 bits per colour channel.
+   SCALER_FMT_BGR24         // 3 bytes per pixel: blue, green, red.
+};
+
+// Filter kernels. SCALER_TYPE_UNKNOWN is the zero value and is rejected by
+// the filter generator.
+enum scaler_type
+{
+   SCALER_TYPE_UNKNOWN = 0,
+   SCALER_TYPE_POINT,    // One tap per output sample.
+   SCALER_TYPE_BILINEAR, // Two taps per output sample.
+   SCALER_TYPE_SINC      // Windowed sinc; 8 or more taps per output sample.
+};
+
+// Filter tables for one axis.
+struct scaler_filter
+{
+   int16_t *filter;        // Coefficients, filter_stride entries per output sample.
+   size_t   filter_len;    // Number of taps applied per output sample.
+   size_t   filter_stride; // Distance between consecutive output samples' coefficients.
+   int     *filter_pos;    // Index of the first source sample for each output sample.
+};
+
+// Scaler state. The caller fills in the sizes, strides, formats and scaler
+// type, then calls scaler_ctx_gen_filter(); everything from scaler_horiz
+// downwards is generated and owned by the scaler.
+struct scaler_ctx
+{
+   // Input image size in pixels and row stride in bytes.
+   int in_width;
+   int in_height;
+   int in_stride;
+
+   // Output image size in pixels and row stride in bytes.
+   int out_width;
+   int out_height;
+   int out_stride;
+
+   enum scaler_pix_fmt in_fmt;
+   enum scaler_pix_fmt out_fmt;
+   enum scaler_type scaler_type;
+
+   // Scaling passes: horizontal reads the (ARGB8888) input and writes
+   // `scaled`; vertical reads `scaled` and writes ARGB8888 output.
+   void (*scaler_horiz)(const struct scaler_ctx*,
+         const void*, int);
+   void (*scaler_vert)(const struct scaler_ctx*,
+         void*, int);
+
+   // Format converters: (output, input, width, height, out_stride, in_stride).
+   void (*in_pixconv)(void*, const void*, int, int, int, int);
+   void (*out_pixconv)(void*, const void*, int, int, int, int);
+   void (*direct_pixconv)(void*, const void*, int, int, int, int);
+
+   // True when input and output sizes match and only direct_pixconv runs.
+   bool unscaled;
+   struct scaler_filter horiz, vert;
+
+   // ARGB8888 copy of the input, used when in_fmt is not ARGB8888.
+   struct
+   {
+      uint32_t *frame;
+      int stride;
+   } input;
+
+   // Result of the horizontal pass: 64 bits per pixel.
+   struct
+   {
+      uint64_t *frame;
+      int width;
+      int height;
+      int stride;
+   } scaled;
+
+   // ARGB8888 result of the vertical pass, used when out_fmt is not ARGB8888.
+   struct
+   {
+      uint32_t *frame;
+      int stride;
+   } output;
+};
+
+// Generates filters and intermediate buffers from the user-set fields of
+// ctx. Returns false on failure.
+bool scaler_ctx_gen_filter(struct scaler_ctx *ctx);
+// Frees and zeroes everything scaler_ctx_gen_filter() generated.
+void scaler_ctx_gen_reset(struct scaler_ctx *ctx);
+
+// Scales (and/or converts) one frame from input to output.
+void scaler_ctx_scale(const struct scaler_ctx *ctx,
+      void *output, const void *input);
+
+// Allocation hooks: zero-initialised elem_size * size bytes, and its release.
+void *scaler_alloc(size_t elem_size, size_t size);
+void scaler_free(void *ptr);
+
+#endif
+
--- a/gfx/scaler/scaler_int.c
+++ b/gfx/scaler/scaler_int.c
@ -0,0 +1,214 @@
+#include "scaler_int.h"
+
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+// Packs four 16-bit channels into one 64-bit pixel: alpha in bits 48-63,
+// red in bits 32-47, green in bits 16-31 and blue in bits 0-15.
+static inline uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b)
+{
+   uint64_t packed = a;
+
+   packed = (packed << 16) | r;
+   packed = (packed << 16) | g;
+   packed = (packed << 16) | b;
+
+   return packed;
+}
+
+// Saturates a signed 16-bit value to the range 0..255.
+static inline uint8_t clamp_8bit(int16_t col)
+{
+   if (col < 0)
+      return 0;
+
+   if (col > 255)
+      return 255;
+
+   return (uint8_t)col;
+}
+
+// ARGB8888 scaler is split in two:
+//
+// First, horizontal scaler is applied.
+// Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 to left to occupy 15 bits.
+// The sign bit is kept empty as we have to do signed multiplication for the filter.
+// A mulhi [(a * b) >> 16] is applied which loses some precision, but is very efficient for SIMD.
+// It is accurate enough for 8-bit purposes.
+//
+// The fixed point 1.0 for filter is (1 << 14). After horizontal scale, the output is kept
+// with 16-bit channels, and will now have 13 bits of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
+//
+// Vertical scaler takes the 13 bit channels, and performs the same mulhi steps.
+// Another 2 bits of precision is lost, which ends up as 11 bits.
+// Scaling is now complete. Channels are shifted right by 3, and saturated into 8-bit values.
+//
+// The C versions of the scalers perform exactly the same operations as the SIMD code, for testing purposes.
+
+#if defined(__SSE2__)
+// SSE2 vertical pass. Reads the 16-bit-per-channel frame produced by the
+// horizontal pass (ctx->scaled) and writes ARGB8888 rows to output_.
+// stride is the output row stride in bytes.
+// For each output pixel the taps of the vertical filter are applied with a
+// signed multiply-high, accumulated with saturation, shifted down to 8 bits
+// and packed with unsigned saturation. Taps are processed two source rows
+// at a time, with a single-row loop for an odd tap count.
+void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
+{
+   const uint64_t *input = ctx->scaled.frame;
+   uint32_t *output = (uint32_t*)output_;
+
+   const int16_t *filter_vert = ctx->vert.filter;
+
+   for (int h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
+   {
+      const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);
+
+      for (int w = 0; w < ctx->out_width; w++)
+      {
+         __m128i res = _mm_setzero_si128();
+
+         const uint64_t *input_base_y = input_base + w;
+
+         size_t y;
+         // scaled.stride >> 3 is one row of 64-bit pixels, >> 2 is two rows.
+         for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2))
+         {
+            // Low half: tap y in all four lanes; high half: tap y + 1.
+            // Broadcasting by multiplying with 0x0001000100010001 is wrong for
+            // negative taps (the borrow turns the upper lanes into coeff - 1).
+            __m128i coeff = _mm_unpacklo_epi64(_mm_set1_epi16(filter_vert[y + 0]), _mm_set1_epi16(filter_vert[y + 1]));
+            __m128i col   = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
+
+            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+         }
+
+         for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
+         {
+            // Only the low half of col is populated, so the high half adds 0.
+            __m128i coeff = _mm_set1_epi16(filter_vert[y]);
+            __m128i col   = _mm_set_epi64x(0, input_base_y[0]);
+
+            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+         }
+
+         // Fold the two halves together, then drop the remaining fraction bits.
+         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
+         res = _mm_srai_epi16(res, (7 - 2 - 2));
+
+         __m128i final = _mm_packus_epi16(res, res);
+
+         output[w] = _mm_cvtsi128_si32(final);
+      }
+   }
+}
+#else
+// Portable vertical pass. Reads the 16-bit-per-channel frame produced by
+// the horizontal pass (ctx->scaled) and writes ARGB8888 rows to output_.
+// stride is the output row stride in bytes.
+// Each channel is multiplied by the tap coefficient keeping the high 16
+// bits of the product, summed over the taps, shifted down to 8 bits and
+// clamped to 0..255.
+void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
+{
+   const uint64_t *input = ctx->scaled.frame;
+   uint32_t *output = output_;
+
+   const int16_t *filter_vert = ctx->vert.filter;
+
+   for (int h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
+   {
+      const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);
+
+      for (int w = 0; w < ctx->out_width; w++)
+      {
+         int16_t res_a = 0;
+         int16_t res_r = 0;
+         int16_t res_g = 0;
+         int16_t res_b = 0;
+
+         const uint64_t *input_base_y = input_base + w;
+         for (size_t y = 0; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
+         {
+            uint64_t col = *input_base_y;
+
+            int16_t a = (col >> 48) & 0xffff;
+            int16_t r = (col >> 32) & 0xffff;
+            int16_t g = (col >> 16) & 0xffff;
+            int16_t b = (col >>  0) & 0xffff;
+
+            int16_t coeff = filter_vert[y];
+
+            res_a += (a * coeff) >> 16;
+            res_r += (r * coeff) >> 16;
+            res_g += (g * coeff) >> 16;
+            res_b += (b * coeff) >> 16;
+         }
+
+         res_a >>= (7 - 2 - 2);
+         res_r >>= (7 - 2 - 2);
+         res_g >>= (7 - 2 - 2);
+         res_b >>= (7 - 2 - 2);
+
+         // Widen before shifting: a uint8_t promotes to int, and shifting
+         // 255 left by 24 would overflow it.
+         output[w] = ((uint32_t)clamp_8bit(res_a) << 24) | ((uint32_t)clamp_8bit(res_r) << 16) |
+            ((uint32_t)clamp_8bit(res_g) << 8) | ((uint32_t)clamp_8bit(res_b) << 0);
+      }
+   }
+}
+#endif
+
+#if defined(__SSE2__)
+// SSE2 horizontal pass. Reads ARGB8888 rows from input_ (stride in bytes)
+// and writes 16-bit-per-channel pixels to ctx->scaled.
+// Each 8-bit channel is widened to 16 bits and shifted left by 7, multiplied
+// by the tap coefficient with a signed multiply-high, and accumulated with
+// saturation. Taps are processed two source pixels at a time, with a
+// single-pixel loop for an odd tap count.
+void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
+{
+   const uint32_t *input = (const uint32_t*)input_;
+   uint64_t *output      = ctx->scaled.frame;
+
+   for (int h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3)
+   {
+      const int16_t *filter_horiz = ctx->horiz.filter;
+
+      for (int w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride)
+      {
+         __m128i res = _mm_setzero_si128();
+
+         const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
+
+         size_t x;
+         for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2)
+         {
+            // Low half: tap x in all four lanes; high half: tap x + 1.
+            // Broadcasting by multiplying with 0x0001000100010001 is wrong for
+            // negative taps (the borrow turns the upper lanes into coeff - 1).
+            __m128i coeff = _mm_unpacklo_epi64(_mm_set1_epi16(filter_horiz[x + 0]), _mm_set1_epi16(filter_horiz[x + 1]));
+
+            __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
+                     ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());
+
+            col = _mm_slli_epi16(col, 7);
+            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+         }
+
+         for (; x < ctx->horiz.filter_len; x++)
+         {
+            // Only the low half of col is populated, so the high half adds 0.
+            __m128i coeff = _mm_set1_epi16(filter_horiz[x]);
+            __m128i col   = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());
+
+            col = _mm_slli_epi16(col, 7);
+            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+         }
+
+         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
+
+         // _mm_cvtsi128_si64() only exists on x86-64; storing the low 64
+         // bits also works on 32-bit SSE2 targets.
+         _mm_storel_epi64((__m128i*)(output + w), res);
+      }
+   }
+}
+#else
+// Portable horizontal pass. Reads ARGB8888 rows from input_ (stride in
+// bytes) and writes 16-bit-per-channel pixels to ctx->scaled.
+// Each 8-bit channel is placed in bits 7-14 of a 16-bit value, multiplied by
+// the tap coefficient keeping the high 16 bits of the product, and summed
+// over the taps.
+void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
+{
+   const uint32_t *src_row = input_;
+   uint64_t *dst_row       = ctx->scaled.frame;
+   const int src_pitch     = stride >> 2;
+   const int dst_pitch     = ctx->scaled.stride >> 3;
+
+   for (int y = 0; y < ctx->scaled.height; y++)
+   {
+      const int16_t *taps = ctx->horiz.filter;
+
+      for (int x = 0; x < ctx->scaled.width; x++)
+      {
+         const uint32_t *src = src_row + ctx->horiz.filter_pos[x];
+
+         int16_t sum_a = 0;
+         int16_t sum_r = 0;
+         int16_t sum_g = 0;
+         int16_t sum_b = 0;
+
+         for (size_t t = 0; t < ctx->horiz.filter_len; t++)
+         {
+            const uint32_t pix   = src[t];
+            const int16_t weight = taps[t];
+
+            const int16_t a = (pix >> (24 - 7)) & (0xff << 7);
+            const int16_t r = (pix >> (16 - 7)) & (0xff << 7);
+            const int16_t g = (pix >> ( 8 - 7)) & (0xff << 7);
+            const int16_t b = (pix << ( 0 + 7)) & (0xff << 7);
+
+            sum_a += (a * weight) >> 16;
+            sum_r += (r * weight) >> 16;
+            sum_g += (g * weight) >> 16;
+            sum_b += (b * weight) >> 16;
+         }
+
+         dst_row[x] = build_argb64(sum_a, sum_r, sum_g, sum_b);
+         taps      += ctx->horiz.filter_stride;
+      }
+
+      src_row += src_pitch;
+      dst_row += dst_pitch;
+   }
+}
+#endif
+
--- a/gfx/scaler/scaler_int.h
+++ b/gfx/scaler/scaler_int.h
@ -0,0 +1,10 @@
+// Declarations of the ARGB8888 scaling passes that operate on a scaler_ctx.
+#ifndef SCALER_INT_H__
+#define SCALER_INT_H__
+
+#include "scaler.h"
+
+// Vertical pass writing to `output`, whose row pitch is `stride`.
+// NOTE(review): the definition is not visible here; presumably it consumes the
+// intermediate frame produced by scaler_argb8888_horiz() - confirm.
+void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output, int stride);
+
+// Horizontal pass: filters each row of `input` (32-bit pixels, row pitch
+// `stride`) with ctx->horiz and stores 64-bit pixels in ctx->scaled.frame.
+void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input, int stride);
+
+#endif
+
--- a/gfx/sdl_gfx.c
+++ b/gfx/sdl_gfx.c
@ -19,6 +19,7 @@
 #include <string.h>
 #include "../general.h"
 #include "../input/rarch_sdl_input.h"
+#include "scaler/scaler.h"
 #include "gfx_common.h"
 #include "gfx_context.h"

@ -53,6 +54,10 @@ typedef struct sdl_video
   uint8_t font_g;
   uint8_t font_b;
 #endif
+
+   struct scaler_ctx scaler;
+   unsigned last_width;
+   unsigned last_height;
 } sdl_video_t;

 static void sdl_gfx_free(void *data)
@ -71,6 +76,8 @@ static void sdl_gfx_free(void *data)
      font_renderer_free(vid->font);
 #endif

+   scaler_ctx_gen_reset(&vid->scaler);
+
   free(vid);
 }

@ -268,23 +275,18 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
   if (!video->fullscreen)
      RARCH_LOG("Creating window @ %ux%u\n", video->width, video->height);

-   vid->render32 = video->rgb32 && !g_settings.video.force_16bit;
+   vid->render32 = !g_settings.video.force_16bit;
   vid->screen = SDL_SetVideoMode(video->width, video->height, vid->render32 ? 32 : 15, SDL_HWSURFACE | SDL_HWACCEL | SDL_DOUBLEBUF | (video->fullscreen ? SDL_FULLSCREEN : 0));

-   if (!vid->screen && !g_settings.video.force_16bit && !video->rgb32)
-   {
-      vid->upsample = true;
-      vid->screen = SDL_SetVideoMode(video->width, video->height, 32, SDL_HWSURFACE | SDL_HWACCEL | SDL_DOUBLEBUF | (video->fullscreen ? SDL_FULLSCREEN : 0));
-      RARCH_WARN("SDL: 15-bit colors failed, attempting 32-bit colors.\n");
-      vid->render32 = true;
-   }
-
   if (!vid->screen)
   {
      RARCH_ERR("Failed to init SDL surface: %s\n", SDL_GetError());
      goto error;
   }

+   if (!video->rgb32 && vid->render32)
+      vid->upsample = true;
+
   SDL_ShowCursor(SDL_DISABLE);

 #ifdef HAVE_X11
@ -358,6 +360,10 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
      vid->convert_32_func = convert_32bit_32bit_shift;
   }

+   vid->scaler.scaler_type = video->smooth ? SCALER_TYPE_BILINEAR : SCALER_TYPE_POINT;
+   vid->scaler.in_fmt  = vid->render32 ? SCALER_FMT_ARGB8888 : SCALER_FMT_0RGB1555;
+   vid->scaler.out_fmt = vid->scaler.in_fmt;
+
   return vid;

 error:
@ -375,13 +381,20 @@ static inline uint16_t conv_pixel_32_15(uint32_t pix, const SDL_PixelFormat *fmt

+// Converts one 15-bit pixel (5 bits per channel, red in bits 10..14, green in
+// bits 5..9, blue in bits 0..4) to a 32-bit pixel laid out according to `fmt`.
 static inline uint32_t conv_pixel_15_32(uint16_t pix, const SDL_PixelFormat *fmt)
 {
-   uint32_t r = ((pix >> 10) & 0x1f) << (fmt->Rshift + 3);
-   uint32_t g = ((pix >>  5) & 0x1f) << (fmt->Gshift + 3);
-   uint32_t b = ((pix >>  0) & 0x1f) << (fmt->Bshift + 3);
-   return r | g | b;
+   // Isolate the three 5-bit channels.
+   uint32_t r = (pix >> 10) & 0x1f;
+   uint32_t g = (pix >>  5) & 0x1f;
+   uint32_t b = (pix >>  0) & 0x1f;
+
+   // Widen 5 bits to 8 by copying the top three bits into the low bits, so
+   // that full intensity (0x1f) maps to 0xff rather than 0xf8.
+   r = (r << 3) | (r >> 2);
+   g = (g << 3) | (g >> 2);
+   b = (b << 3) | (b >> 2);
+
+   // Move each 8-bit channel to the bit position given by the pixel format.
+   return (r << fmt->Rshift) | (g << fmt->Gshift) | (b << fmt->Bshift);
 }

-static void convert_32bit_15bit(uint16_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
+static void convert_32bit_15bit(uint16_t *out, unsigned outpitch,
+      const uint32_t *input, unsigned width, unsigned height,
+      unsigned pitch, const SDL_PixelFormat *fmt)
 {
   for (unsigned y = 0; y < height; y++)
   {
@ -393,7 +406,9 @@ static void convert_32bit_15bit(uint16_t *out, unsigned outpitch, const uint32_t
   }
 }

-static void convert_15bit_32bit(uint32_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
+static void convert_15bit_32bit(uint32_t *out, unsigned outpitch,
+      const uint16_t *input, unsigned width, unsigned height,
+      unsigned pitch, const SDL_PixelFormat *fmt)
 {
   for (unsigned y = 0; y < height; y++)
   {
@ -405,7 +420,9 @@ static void convert_15bit_32bit(uint32_t *out, unsigned outpitch, const uint16_t
   }
 }

-static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
+static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch,
+      const uint16_t *input, unsigned width, unsigned height,
+      unsigned pitch, const SDL_PixelFormat *fmt)
 {
   for (unsigned y = 0; y < height; y++)
   {
@ -416,7 +433,9 @@ static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch, const u
   (void)fmt;
 }

-static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
+static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch,
+      const uint32_t *input, unsigned width, unsigned height,
+      unsigned pitch, const SDL_PixelFormat *fmt)
 {
   for (unsigned y = 0; y < height; y++)
   {
@ -427,12 +446,15 @@ static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch, const u
   (void)fmt;
 }

-static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
+static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch,
+      const uint16_t *input, unsigned width, unsigned height,
+      unsigned pitch, const SDL_PixelFormat *fmt)
 {
   for (unsigned y = 0; y < height; y++)
   {
-      uint16_t *dest = out + ((y * outpitch) >> 1);
+      uint16_t *dest      = out + ((y * outpitch) >> 1);
      const uint16_t *src = input + ((y * pitch) >> 1);
+
      for (unsigned x = 0; x < width; x++)
      {
         uint16_t color = src[x];
@ -444,12 +466,15 @@ static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch, const ui
   }
 }

-static void convert_32bit_32bit_shift(uint32_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt)
+static void convert_32bit_32bit_shift(uint32_t *out, unsigned outpitch,
+      const uint32_t *input, unsigned width, unsigned height,
+      unsigned pitch, const SDL_PixelFormat *fmt)
 {
   for (unsigned y = 0; y < height; y++)
   {
-      uint32_t *dest = out + ((y * outpitch) >> 2);
+      uint32_t *dest      = out + ((y * outpitch) >> 2);
      const uint32_t *src = input + ((y * pitch) >> 2);
+
      for (unsigned x = 0; x < width; x++)
      {
         uint32_t color = src[x];
@ -488,43 +513,51 @@ static bool sdl_gfx_frame(void *data, const void *frame, unsigned width, unsigne
   if (SDL_MUSTLOCK(vid->buffer))
      SDL_LockSurface(vid->buffer);

-   // :(
-   // 15-bit -> 32-bit (Sometimes 15-bit won't work on "modern" OSes :\)
+   // 15-bit -> 32-bit.
   if (vid->upsample)
      convert_15bit_32bit((uint32_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
   // 15-bit -> 15-bit
   else if (!vid->rgb32)
      vid->convert_15_func((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
   // 32-bit -> 15-bit
-   else if (vid->rgb32 && g_settings.video.force_16bit)
+   else if (vid->rgb32 && !vid->render32)
      convert_32bit_15bit((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint32_t*)frame, width, height, pitch, vid->screen->format);
   // 32-bit -> 32-bit
   else
      vid->convert_32_func((uint32_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint32_t*)frame, width, height, pitch, vid->screen->format);

+   if (width != vid->last_width || height != vid->last_height)
+   {
+      vid->scaler.in_width  = width;
+      vid->scaler.in_height = height;
+      vid->scaler.in_stride = vid->buffer->pitch;
+
+      vid->scaler.out_width  = vid->screen->w;
+      vid->scaler.out_height = vid->screen->h;
+      vid->scaler.out_stride = vid->screen->pitch;
+
+      scaler_ctx_gen_filter(&vid->scaler);
+
+      vid->last_width  = width;
+      vid->last_height = height;
+   }
+
+   if (SDL_MUSTLOCK(vid->screen))
+      SDL_LockSurface(vid->screen);
+
+   scaler_ctx_scale(&vid->scaler, vid->screen->pixels, vid->buffer->pixels);
+
   if (SDL_MUSTLOCK(vid->buffer))
      SDL_UnlockSurface(vid->buffer);
-
-   SDL_Rect src = {0};
-   src.x = 0;
-   src.y = 0;
-   src.w = width;
-   src.h = height;
-
-   SDL_Rect dest = {0};
-   dest.x = 0;
-   dest.y = 0;
-   dest.w = vid->screen->w;
-   dest.h = vid->screen->h;
-
-   SDL_SoftStretch(vid->buffer, &src, vid->screen, &dest);
+   if (SDL_MUSTLOCK(vid->screen))
+      SDL_UnlockSurface(vid->screen);

   if (msg)
   {
-      if ((!vid->rgb32 || g_settings.video.force_16bit) && !vid->upsample)
-         sdl_render_msg_15(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
-      else
+      if (vid->render32)
         sdl_render_msg_32(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
+      else
+         sdl_render_msg_15(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
   }

   char buf[128];