From 404d2241ec55bc6048eeb7c09bc7cdb248ecf632 Mon Sep 17 00:00:00 2001 From: Brian Foley Date: Tue, 26 Nov 2002 09:21:01 +0000 Subject: [PATCH] altivec accelerated v-resample patch by (Brian Foley ) Originally committed as revision 1283 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/dsputil.h | 4 ++ libavcodec/imgresample.c | 136 ++++++++++++++++++++++++++++++++++- libavcodec/ppc/dsputil_ppc.c | 4 ++ 3 files changed, 143 insertions(+), 1 deletion(-) diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 0b1d34f908..29aca1ac22 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -190,6 +190,10 @@ void dsputil_init_alpha(DSPContext* c, unsigned mask); #elif defined(ARCH_POWERPC) +#define MM_ALTIVEC 0x0001 /* standard AltiVec */ + +extern int mm_flags; + #define __align8 __attribute__ ((aligned (16))) void dsputil_init_ppc(DSPContext* c, unsigned mask); diff --git a/libavcodec/imgresample.c b/libavcodec/imgresample.c index 1197f858bc..630f2eb2c6 100644 --- a/libavcodec/imgresample.c +++ b/libavcodec/imgresample.c @@ -22,7 +22,7 @@ #ifdef USE_FASTMEMCPY #include "fastmemcpy.h" #endif - +extern int mm_flags; #define NB_COMPONENTS 3 @@ -264,6 +264,133 @@ static void v_resample4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int wrap, } #endif +#ifdef HAVE_ALTIVEC +typedef union { + vector unsigned char v; + unsigned char c[16]; +} vec_uc_t; + +typedef union { + vector signed short v; + signed short s[8]; +} vec_ss_t; + +void v_resample16_altivec(UINT8 *dst, int dst_width, UINT8 *src, int wrap, + INT16 *filter) +{ /* Vertical 4-tap filter for one output row of dst_width pixels: each dst pixel is the sum of the four source pixels s[k * wrap] (k = 0..3) weighted by filter[k], shifted right by FILTER_BITS and clipped to 0..255. A scalar head runs until dst is 16-byte aligned, the AltiVec loop then produces 16 pixels per iteration, and a scalar tail finishes the row. */ + int sum, i; + uint8_t *s; + vector unsigned char *tv, tmp, dstv, zero; + vec_ss_t srchv[4], srclv[4], fv[4]; + vector signed short zeros, sumhv, sumlv; + s = src; + + for(i=0;i<4;i++) + { + /* + The vec_madds later on does an implicit >>15 on the result. + Since FILTER_BITS is 8, and we have 15 bits of magnitude in + a signed short, we have just enough bits to pre-shift our + filter constants <<7 to compensate for vec_madds. 
+ */ + fv[i].s[0] = filter[i] << (15-FILTER_BITS); /* NOTE(review): a tap equal to 1 << FILTER_BITS would not fit in a signed short after this shift - confirm the filter tables never contain it */ + fv[i].v = vec_splat(fv[i].v, 0); + } + + zero = vec_splat_u8(0); + zeros = vec_splat_s16(0); + + + /* + When we're resampling, we'd ideally like both our input buffers, + and output buffers to be 16-byte aligned, so we can do both aligned + reads and writes. Sadly we can't always have this at the moment, so + we opt for aligned writes, as unaligned writes have a huge overhead. + To do this, do enough scalar resamples to get dst 16-byte aligned. + */ + i = (16 - ((unsigned long)dst & 0xf)) & 0xf; /* pixels needed to reach the next 16-byte boundary (0..15), computed unsigned so the pointer is never squeezed through a signed int */ + while(i>0 && dst_width>0) { /* also stop at the end of the row: a row shorter than the alignment run must not be written past dst_width */ + sum = s[0 * wrap] * filter[0] + + s[1 * wrap] * filter[1] + + s[2 * wrap] * filter[2] + + s[3 * wrap] * filter[3]; + sum = sum >> FILTER_BITS; + if (sum<0) sum = 0; else if (sum>255) sum=255; + dst[0] = sum; + dst++; + s++; + dst_width--; + i--; + } + + /* Do our altivec resampling on 16 pixels at once. */ + while(dst_width>=16) { + /* + Read 16 (potentially unaligned) bytes from each of + 4 lines into 4 vectors, and split them into shorts. + Interleave the multiply/accumulate for the resample + filter with the loads to hide the 3 cycle latency + the vec_madds have. 
+ */ + tv = (vector unsigned char *) &s[0 * wrap]; /* NOTE(review): tv[1] loads the 16 bytes following tv[0], which can reach past the 16 source pixels consumed here - confirm the line buffer is padded */ + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap])); /* permute mask comes from row 0's own address, matching rows 1..3, not from the alignment counter i */ + srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[0].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[0].v, fv[0].v, zeros); + sumlv = vec_madds(srclv[0].v, fv[0].v, zeros); + + tv = (vector unsigned char *) &s[1 * wrap]; + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap])); + srchv[1].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[1].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv); + sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv); + + tv = (vector unsigned char *) &s[2 * wrap]; + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap])); + srchv[2].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[2].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv); + sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv); + + tv = (vector unsigned char *) &s[3 * wrap]; + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap])); + srchv[3].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[3].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); + sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); + + /* + Pack the results into our destination vector, + and do an aligned write of that back to memory. + */ + dstv = vec_packsu(sumhv, sumlv) ; + vec_st(dstv, 0, (vector unsigned char *) dst); + + dst+=16; + s+=16; + dst_width-=16; + } + + /* + If there are any leftover pixels, resample them + with the slow scalar method. + */ + while(dst_width>0) { + sum = s[0 * wrap] * filter[0] + + s[1 * wrap] * filter[1] + + s[2 * wrap] * filter[2] + + s[3 * wrap] * filter[3]; + sum = sum >> FILTER_BITS; + if (sum<0) sum = 0; else if (sum>255) sum=255; + dst[0] = sum; + dst++; + s++; + dst_width--; + } +} +#endif + /* slow version to handle limit cases. 
Does not need optimisation */ static void h_resample_slow(UINT8 *dst, int dst_width, UINT8 *src, int src_width, int src_start, int src_incr, INT16 *filters) @@ -383,6 +510,13 @@ static void component_resample(ImgReSampleContext *s, s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth, &s->v_filters[phase_y][0]); else +#endif +#ifdef HAVE_ALTIVEC + if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS == 8) + v_resample16_altivec(output, owidth, + s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth, + &s->v_filters[phase_y][0]); + else #endif v_resample(output, owidth, s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth, diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index c4dae6cb3b..ffe3ce0633 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -23,6 +23,8 @@ #include "dsputil_altivec.h" #endif +int mm_flags = 0; + void dsputil_init_ppc(DSPContext* c, unsigned mask) { // Common optimisations whether Altivec or not @@ -31,6 +33,8 @@ void dsputil_init_ppc(DSPContext* c, unsigned mask) #if HAVE_ALTIVEC if (has_altivec()) { + mm_flags |= MM_ALTIVEC; + // Altivec specific optimisations c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec; c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;