Bug 1266491 - Use an ssse3 scaler for video. r=mstange

This is a separable scaler that improves from performance from 15-16fps to 21-22fps
2025-02-27 12:50:09 +00:00 · 2016-05-28 10:12:12 -04:00 · 2016-05-28 10:12:12 -04:00 · a5fc6a819f
commit a5fc6a819f
parent 85d22c8bbc
4 changed files with 659 additions and 0 deletions
--- a/gfx/2d/moz.build
+++ b/gfx/2d/moz.build
@ -54,6 +54,8 @@ EXPORTS.mozilla.gfx += [
    'UserData.h',
 ]

+EXPORTS.mozilla.gfx += ['ssse3-scaler.h']
+
 if CONFIG['MOZ_WIDGET_TOOLKIT'] in ('cocoa', 'uikit'):
    EXPORTS.mozilla.gfx += [
        'MacIOSurface.h',
@ -111,6 +113,7 @@ if CONFIG['INTEL_ARCHITECTURE']:
        'BlurSSE2.cpp',
        'FilterProcessingSSE2.cpp',
        'ImageScalingSSE2.cpp',
+        'ssse3-scaler.c',
    ]
    if CONFIG['MOZ_ENABLE_SKIA']:
        SOURCES += [
@ -122,6 +125,7 @@ if CONFIG['INTEL_ARCHITECTURE']:
    SOURCES['BlurSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
    SOURCES['FilterProcessingSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
    SOURCES['ImageScalingSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
+    SOURCES['ssse3-scaler.c'].flags += CONFIG['SSSE3_FLAGS']
    if CONFIG['MOZ_ENABLE_SKIA']:
        SOURCES['convolverSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
 elif CONFIG['CPU_ARCH'].startswith('mips'):
--- a/gfx/2d/ssse3-scaler.c
+++ b/gfx/2d/ssse3-scaler.c
@ -0,0 +1,559 @@
+/*
+ * Copyright © 2013 Soren Sandmann Pedersen
+ * Copyright © 2013 Red Hat, Inc.
+ * Copyright © 2016 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann (soren.sandmann@gmail.com)
+ *         Jeff Muizelaar (jmuizelaar@mozilla.com)
+ */
+
+/* This has been adapted from the ssse3 code from pixman. It's currently
+ * a mess as I want to try it out in practice before finalizing the details.
+ */
+
+#include <stdlib.h>
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <stdint.h>
+#include <assert.h>
+
+typedef int32_t                 pixman_fixed_16_16_t;
+typedef pixman_fixed_16_16_t    pixman_fixed_t;
+#define pixman_fixed_1                  (pixman_int_to_fixed(1))
+#define pixman_fixed_to_int(f)          ((int) ((f) >> 16))
+#define pixman_int_to_fixed(i)          ((pixman_fixed_t) ((i) << 16))
+#define pixman_double_to_fixed(d)       ((pixman_fixed_t) ((d) * 65536.0))
+typedef struct pixman_vector pixman_vector_t;
+
+typedef int pixman_bool_t;
+typedef int64_t                 pixman_fixed_32_32_t;
+typedef pixman_fixed_32_32_t    pixman_fixed_48_16_t;
+typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
+
+struct pixman_vector
+{
+    pixman_fixed_t      vector[3];
+};
+typedef struct pixman_transform pixman_transform_t;
+
+struct pixman_transform
+{
+    pixman_fixed_t      matrix[3][3];
+};
+
+#ifdef _MSC_VER
+#define force_inline __forceinline
+#else
+#define force_inline __inline__ __attribute__((always_inline))
+#endif
+
+#define BILINEAR_INTERPOLATION_BITS 6
+
+static force_inline int
+pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
+{
+    return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
+                               ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
+}
+
+static void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result)
+{
+    int i;
+    int64_t tmp[3][2];
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++)
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+    result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+    result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
+}
+
+static pixman_bool_t
+pixman_transform_point_3d (const struct pixman_transform *transform,
+                           struct pixman_vector *         vector)
+{
+    pixman_vector_48_16_t tmp;
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
+
+    pixman_transform_point_31_16_3d (transform, &tmp, &tmp);
+
+    vector->vector[0] = tmp.v[0];
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
+
+    return vector->vector[0] == tmp.v[0] &&
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
+}
+
+
+struct bits_image_t
+{
+    uint32_t *                 bits;
+    int                        rowstride;
+    pixman_transform_t *transform;
+};
+
+typedef struct bits_image_t bits_image_t;
+typedef struct {
+    int unused;
+} pixman_iter_info_t;
+
+typedef struct pixman_iter_t pixman_iter_t;
+typedef void      (* pixman_iter_fini_t)         (pixman_iter_t *iter);
+
+struct pixman_iter_t
+{
+    int x, y;
+    pixman_iter_fini_t          fini;
+    bits_image_t *image;
+    uint32_t *                  buffer;
+    int width;
+    int height;
+    void *                      data;
+};
+
+typedef struct
+{
+    int		y;
+    uint64_t *	buffer;
+} line_t;
+
+typedef struct
+{
+    line_t		lines[2];
+    pixman_fixed_t	y;
+    pixman_fixed_t	x;
+    uint64_t		data[1];
+} bilinear_info_t;
+
+static void
+ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
+			int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
+{
+    uint32_t *bits = image->bits + y * image->rowstride;
+    __m128i vx = _mm_set_epi16 (
+	- (x + 1), x, - (x + 1), x,
+	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
+    __m128i vux = _mm_set_epi16 (
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
+    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
+    __m128i *b = (__m128i *)line->buffer;
+    __m128i vrl0, vrl1;
+
+    while ((n -= 2) >= 0)
+    {
+        __m128i vw, vr, s;
+#ifdef HACKY_PADDING
+        if (pixman_fixed_to_int(x + ux) >= image->rowstride) {
+            vrl1 = _mm_setzero_si128();
+            printf("overread 2loop\n");
+         } else {
+                 if (pixman_fixed_to_int(x + ux) < 0)
+                         printf("underflow\n");
+        vrl1 = _mm_loadl_epi64(
+            (__m128i *)(bits + (pixman_fixed_to_int(x + ux) < 0 ? 0 : pixman_fixed_to_int(x + ux))));
+        }
+#else
+        vrl1 = _mm_loadl_epi64(
+            (__m128i *)(bits + pixman_fixed_to_int(x + ux)));
+#endif
+	/* vrl1: R1, L1 */
+
+    final_pixel:
+#ifdef HACKY_PADDING
+	vrl0 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + (pixman_fixed_to_int (x) < 0 ? 0 : pixman_fixed_to_int (x))));
+#else
+        vrl0 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + pixman_fixed_to_int (x)));
+#endif
+        /* vrl0: R0, L0 */
+
+	/* The weights are based on vx which is a vector of 
+	 *
+	 *    - (x + 1), x, - (x + 1), x,
+	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
+	 *
+	 * so the 16 bit weights end up like this:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 *
+	 * and after shifting and packing, we get these bytes:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *
+	 * which means the first and the second input pixel 
+	 * have to be interleaved like this:
+	 *
+	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 *
+	 * before maddubsw can be used.
+	 */
+
+	vw = _mm_add_epi16 (
+	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+
+	vw = _mm_packus_epi16 (vw, vw);
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+	vx = _mm_add_epi16 (vx, vux);
+
+	x += 2 * ux;
+
+	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
+	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
+
+	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
+	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
+
+	vr = _mm_unpackhi_epi8 (vr, s);
+	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 */
+
+	vr = _mm_maddubs_epi16 (vr, vw);
+
+	/* When the weight is 0, the inverse weight is
+	 * 128 which can't be represented in a signed byte.
+	 * As a result maddubsw computes the following:
+	 *
+	 *     r = l * -128 + r * 0
+	 *
+	 * rather than the desired
+	 *
+	 *     r = l * 128 + r * 0
+	 *
+	 * We fix this by taking the absolute value of the
+	 * result.
+	 */
+        // we can drop this if we use lower precision
+
+	vr = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (2, 0, 3, 1));
+	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
+	_mm_store_si128 (b++, vr);
+    }
+
+    if (n == -1)
+    {
+	vrl1 = _mm_setzero_si128();
+	goto final_pixel;
+    }
+
+    line->y = y;
+}
+
+// scale a line of destination pixels
+static uint32_t *
+ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_fixed_t fx, ux;
+    bilinear_info_t *info = iter->data;
+    line_t *line0, *line1;
+    int y0, y1;
+    int32_t dist_y;
+    __m128i vw, uvw;
+    int i;
+
+    fx = info->x;
+    ux = iter->image->transform->matrix[0][0];
+
+    y0 = pixman_fixed_to_int (info->y);
+    if (y0 < 0)
+        *(volatile char*)0 = 9;
+    y1 = y0 + 1;
+
+    // clamping in y direction
+    if (y1 >= iter->height) {
+        y1 = iter->height - 1;
+    }
+
+    line0 = &info->lines[y0 & 0x01];
+    line1 = &info->lines[y1 & 0x01];
+
+    if (line0->y != y0)
+    {
+	ssse3_fetch_horizontal (
+	    iter->image, line0, y0, fx, ux, iter->width);
+    }
+
+    if (line1->y != y1)
+    {
+	ssse3_fetch_horizontal (
+	    iter->image, line1, y1, fx, ux, iter->width);
+    }
+
+#ifdef PIXMAN_STYLE_INTERPOLATION
+    dist_y = pixman_fixed_to_bilinear_weight (info->y);
+    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
+
+    vw = _mm_set_epi16 (
+	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+
+#else
+    // setup the weights for the top (vw) and bottom (uvw) lines
+    dist_y = pixman_fixed_to_bilinear_weight (info->y);
+    // we use 15 instead of 16 because we need an extra bit to handle when the weights are 0 and 1
+    dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
+
+    vw = _mm_set_epi16 (
+	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+
+
+    dist_y = (1 << BILINEAR_INTERPOLATION_BITS) - pixman_fixed_to_bilinear_weight (info->y);
+    dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
+    uvw = _mm_set_epi16 (
+	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+#endif
+
+    for (i = 0; i + 3 < iter->width; i += 4)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
+	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
+#ifdef PIXMAN_STYLE_INTERPOLATION
+	__m128i r0, r1, tmp, p;
+
+        r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+        //r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+        // tmp = bot1 < top1 ? vw : 0;
+        // r1 = (bot1 - top1)*vw + top1 - tmp
+        // r1 = bot1*vw - vw*top1 + top1 - tmp
+        // r1 = bot1*vw + top1 - vw*top1 - tmp
+        // r1 = bot1*vw + top1*(1 - vw) - tmp
+	r1 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot1, top1), vw);
+	tmp = _mm_cmplt_epi16 (bot1, top1);
+	tmp = _mm_and_si128 (tmp, vw);
+	r1 = _mm_sub_epi16 (r1, tmp);
+	r1 = _mm_add_epi16 (r1, top1);
+	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
+	//r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
+#else
+	__m128i r0, r1, p;
+        top0 = _mm_mulhi_epu16 (top0, uvw);
+        bot0 = _mm_mulhi_epu16 (bot0, vw);
+        r0 = _mm_add_epi16(top0, bot0);
+        r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1);
+
+        top1 = _mm_mulhi_epu16 (top1, uvw);
+        bot1 = _mm_mulhi_epu16 (bot1, vw);
+        r1 = _mm_add_epi16(top1, bot1);
+        r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS-1);
+#endif
+
+	p = _mm_packus_epi16 (r0, r1);
+	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
+    }
+
+    while (i < iter->width)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+
+#ifdef PIXMAN_STYLE_INTERPOLATION
+	__m128i r0, tmp, p;
+	r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+#else
+	__m128i r0, p;
+        top0 = _mm_mulhi_epu16 (top0, uvw);
+        bot0 = _mm_mulhi_epu16 (bot0, vw);
+        r0 = _mm_add_epi16(top0, bot0);
+        r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1);
+#endif
+
+	p = _mm_packus_epi16 (r0, r0);
+
+	if (iter->width - i == 1)
+	{
+	    *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
+	    i++;
+	}
+	else
+	{
+	    _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
+	    i += 2;
+	}
+    }
+
+    info->y += iter->image->transform->matrix[1][1];
+
+    return iter->buffer;
+}
+
+static void
+ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+    free (iter->data);
+}
+
+static void
+ssse3_bilinear_cover_iter_init (pixman_iter_t *iter)
+{
+    int width = iter->width;
+    bilinear_info_t *info;
+    pixman_vector_t v;
+
+    /* Reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (iter->image->transform, &v))
+	goto fail;
+
+    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
+    if (!info)
+	goto fail;
+
+    info->x = v.vector[0] - pixman_fixed_1 / 2;
+    info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+#define ALIGN(addr)							\
+    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+    /* It is safe to set the y coordinates to -1 initially
+     * because COVER_CLIP_BILINEAR ensures that we will only
+     * be asked to fetch lines in the [0, height) interval
+     */
+    info->lines[0].y = -1;
+    info->lines[0].buffer = ALIGN (&(info->data[0]));
+    info->lines[1].y = -1;
+    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
+
+    iter->fini = ssse3_bilinear_cover_iter_fini;
+
+    iter->data = info;
+    return;
+
+fail:
+    /* Something went wrong, either a bad matrix or OOM; in such cases,
+     * we don't guarantee any particular rendering.
+     */
+    iter->fini = NULL;
+}
+
+/* scale the src from src_width/height to dest_width/height drawn
+ * into the rectangle x,y width,height
+ * src_stride and dst_stride are 4 byte units */
+void ssse3_scale_data(uint32_t *src, int src_width, int src_height, int src_stride,
+                      uint32_t *dest, int dest_width, int dest_height,
+                      int dest_stride,
+                      int x, int y,
+                      int width, int height)
+{
+    //XXX: assert(src_width > 1)
+    pixman_transform_t transform = {
+        { { pixman_fixed_1, 0, 0 },
+            { 0, pixman_fixed_1, 0 },
+            { 0, 0, pixman_fixed_1 } }
+    };
+    double width_scale = ((double)src_width)/dest_width;
+    double height_scale = ((double)src_height)/dest_height;
+#define AVOID_PADDING
+#ifdef AVOID_PADDING
+    // scale up by enough that we don't read outside of the bounds of the source surface
+    // currently this is required to avoid reading out of bounds.
+    if (width_scale < 1) {
+        width_scale = (double)(src_width-1)/dest_width;
+        transform.matrix[0][2] = pixman_fixed_1/2;
+    }
+    if (height_scale < 1) {
+        height_scale = (double)(src_height-1)/dest_height;
+        transform.matrix[1][2] = pixman_fixed_1/2;
+    }
+#endif
+    transform.matrix[0][0] = pixman_double_to_fixed(width_scale);
+    transform.matrix[1][1] = pixman_double_to_fixed(height_scale);
+    transform.matrix[2][2] = pixman_fixed_1;
+
+    bits_image_t image;
+    image.bits = src;
+    image.transform = &transform;
+    image.rowstride = src_stride;
+
+    pixman_iter_t iter;
+    iter.image = &image;
+    iter.x = x;
+    iter.y = y;
+    iter.width = width;
+    iter.height = src_height;
+    iter.buffer = dest;
+
+    ssse3_bilinear_cover_iter_init(&iter);
+    for (int iy = 0; iy < height; iy++) {
+        ssse3_fetch_bilinear_cover(&iter, NULL);
+        iter.buffer += dest_stride;
+    }
+    ssse3_bilinear_cover_iter_fini(&iter);
+
+}
--- a/gfx/2d/ssse3-scaler.h
+++ b/gfx/2d/ssse3-scaler.h
@ -0,0 +1,22 @@
+#/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_GFX_2D_SSSE3_SCALER_H_
+#define MOZILLA_GFX_2D_SSSE3_SCALER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void ssse3_scale_data(uint32_t *src, int src_width, int src_height,
+                int src_stride,
+                uint32_t *dest, int dest_width, int dest_height,
+                int dest_rowstride,
+                int x, int y,
+                int width, int height);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MOZILLA_GFX_2D_SSS3_SCALER_H_
--- a/gfx/layers/basic/BasicCompositor.cpp
+++ b/gfx/layers/basic/BasicCompositor.cpp
@ -12,6 +12,8 @@
 #include "mozilla/gfx/2D.h"
 #include "mozilla/gfx/Helpers.h"
 #include "mozilla/gfx/Tools.h"
+#include "mozilla/gfx/ssse3-scaler.h"
+#include "mozilla/SSE.h"
 #include "gfxUtils.h"
 #include "YCbCrUtils.h"
 #include <algorithm>
@ -278,6 +280,69 @@ SetupMask(const EffectChain& aEffectChain,
  }
 }

+static bool
+AttemptVideoScale(TextureSourceBasic* aSource, const SourceSurface* aSourceMask,
+                       gfx::Float aOpacity, CompositionOp aBlendMode,
+                       const TexturedEffect* aTexturedEffect,
+                       const Matrix& aNewTransform, const gfx::Rect& aRect,
+                       const gfx::IntRect& aClipRect,
+                       DrawTarget* aDest, const DrawTarget* aBuffer)
+{
+  if (!mozilla::supports_ssse3())
+      return false;
+  if (aNewTransform.IsTranslation()) // unscaled painting should take the regular path
+      return false;
+  if (aNewTransform.HasNonAxisAlignedTransform() || aNewTransform.HasNegativeScaling())
+      return false;
+  if (aSourceMask || aOpacity != 1.0f)
+      return false;
+  if (aBlendMode != CompositionOp::OP_OVER && aBlendMode != CompositionOp::OP_SOURCE)
+      return false;
+
+  IntRect dstRect;
+  // the compiler should know a lot about aNewTransform at this point
+  // maybe it can do some sophisticated optimization of the following
+  if (!aNewTransform.TransformBounds(aRect).ToIntRect(&dstRect))
+      return false;
+
+  if (!(aTexturedEffect->mTextureCoords == Rect(0.0f, 0.0f, 1.0f, 1.0f)))
+      return false;
+  if (aDest->GetFormat() == SurfaceFormat::R5G6B5_UINT16)
+      return false;
+
+  uint8_t* dstData;
+  IntSize dstSize;
+  int32_t dstStride;
+  SurfaceFormat dstFormat;
+  if (aDest->LockBits(&dstData, &dstSize, &dstStride, &dstFormat)) {
+    // If we're not painting to aBuffer the clip will
+    // be applied later
+    IntRect fillRect = dstRect;
+    if (aDest == aBuffer) {
+      // we need to clip fillRect because LockBits ignores the clip on the aDest
+      fillRect = fillRect.Intersect(aClipRect);
+    }
+
+    fillRect = fillRect.Intersect(IntRect(IntPoint(0, 0), aDest->GetSize()));
+    IntPoint offset = fillRect.TopLeft() - dstRect.TopLeft();
+
+    RefPtr<DataSourceSurface> srcSource = aSource->GetSurface(aDest)->GetDataSurface();
+    DataSourceSurface::ScopedMap mapSrc(srcSource, DataSourceSurface::READ);
+
+    ssse3_scale_data((uint32_t*)mapSrc.GetData(), srcSource->GetSize().width, srcSource->GetSize().height,
+                     mapSrc.GetStride()/4,
+                     ((uint32_t*)dstData) + fillRect.x + (dstStride / 4) * fillRect.y, dstRect.width, dstRect.height,
+                     dstStride / 4,
+                     offset.x, offset.y,
+                     fillRect.width, fillRect.height);
+
+    aDest->ReleaseBits(dstData);
+    return true;
+  } else {
+    return false;
+  }
+}
+
 void
 BasicCompositor::DrawQuad(const gfx::Rect& aRect,
                          const gfx::IntRect& aClipRect,
@ -366,12 +431,21 @@ BasicCompositor::DrawQuad(const gfx::Rect& aRect,
      TextureSourceBasic* source = texturedEffect->mTexture->AsSourceBasic();

      if (source && texturedEffect->mPremultiplied) {
+        // we have a fast path for video here
+        if (source->mFromYCBCR &&
+            AttemptVideoScale(source, sourceMask, aOpacity, blendMode,
+                              texturedEffect,
+                              newTransform, aRect, aClipRect - offset,
+                              dest, buffer)) {
+          // we succeeded in scaling
+        } else {
          DrawSurfaceWithTextureCoords(dest, aRect,
                                       source->GetSurface(dest),
                                       texturedEffect->mTextureCoords,
                                       texturedEffect->mFilter,
                                       DrawOptions(aOpacity, blendMode),
                                       sourceMask, &maskTransform);
+        }
      } else if (source) {
        SourceSurface* srcSurf = source->GetSurface(dest);
        if (srcSurf) {