#include <xmmintrin.h>

#include "qcmsint.h"

/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
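
/* Note on the constants above (illustrative, assuming PRECACHE_OUTPUT_SIZE is the
 * precache LUT length from qcmsint.h): clamping to (N-1)/N before multiplying by
 * FLOATSCALE (= N) keeps the scaled value at or below N-1, so the converted integer
 * is always a valid index into the N-entry output tables. For example, if N were
 * 8192, a channel value of 1.0 clamps to 8191/8192 and scales to 8191.0 instead of
 * 8192.0, which would read one element past the end of the table. */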

void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
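    /* Worked example of the line above (the address is made up for illustration):
     * if input_back happens to start at 0x1004, &input_back[16] is 0x1014 and masking
     * with ~0xf rounds down to 0x1010 -- a 16-byte-aligned address that always lands
     * between &input_back[0] and &input_back[16], so the 16 bytes used for the four
     * floats stay inside the 32-byte input_back buffer. */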
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;
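    /* (The same 16 aligned bytes play two roles: during the arithmetic they hold four
     * floats, and after _mm_cvtps_pi32 stores its results there they are re-read as
     * four uint32_t LUT indices, of which only the first three are consumed.) */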

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);
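
    /* The per-pixel work below is, in scalar terms (a sketch, not part of the
     * original code):
     *
     *     for each output channel c in {r, g, b}:
     *         v      = mat[0][c]*R + mat[1][c]*G + mat[2][c]*B   (R, G, B from the gamma LUTs)
     *         v      = min(max(v, 0.0f), CLAMPMAXVAL)
     *         idx[c] = (int) nearbyintf(v * FLOATSCALE)
     *         out[c] = output_table_c[idx[c]]
     *
     * where the float-to-int conversion uses the current SSE rounding mode
     * (round-to-nearest by default), which is what _mm_cvtps_pi32 does. */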

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;
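    /* (The loop below is software-pipelined: the gamma-table loads for the first pixel
     * happen here, and each iteration stores pixel i while loading the table entries
     * for pixel i+1, so the final pixel has to be finished after the loop.) */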

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
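        /* (_mm_cvtps_pi32 converts the low two floats to two 32-bit ints in an MMX
         * register; _mm_movehl_ps then brings the high two floats down so the second
         * conversion covers lanes 2-3. Because MMX registers alias the x87 stack,
         * this is also why _mm_empty() is needed before returning.) */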

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
        dest += RGB_OUTPUT_COMPONENTS;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];

    _mm_empty();
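    /* (_mm_empty() issues EMMS, clearing the MMX/x87 state left behind by the
     * _mm_cvtps_pi32 calls above; without it, later x87 floating point would
     * misbehave.) */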
}
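
/* Typical call path (an illustrative sketch, not part of the original file): callers
 * normally go through qcms_transform_data(), which invokes whichever variant was
 * selected at transform-creation time (this SSE1 path, the SSE2 path, or plain C)
 * through the transform's function pointer, e.g.:
 *
 *     qcms_transform *t = qcms_transform_create(in_profile,  QCMS_DATA_RGB_8,
 *                                               out_profile, QCMS_DATA_RGB_8,
 *                                               QCMS_INTENT_PERCEPTUAL);
 *     qcms_transform_data(t, src_pixels, dest_pixels, pixel_count);
 *     qcms_transform_release(t);
 *
 * See transform.c for the authoritative dispatch logic. */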

void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    unsigned char alpha;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next */
        dest[OUTPUT_A_INDEX] = alpha;
        alpha = src[3];
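        /* (Alpha is copied through unchanged -- no gamma or matrix is applied to it --
         * and the store sits here so it can overlap with the SSE arithmetic; src
         * already points at the next pixel at this point in the iteration.) */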

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[OUTPUT_A_INDEX] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];

    _mm_empty();
}