Cosmetics: these changes should not hurt performance; scream if they do

Originally committed as revision 5493 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Luca Barbato 2006-06-17 18:46:06 +00:00
parent a97c1e93aa
commit e8772eecdc
2 changed files with 386 additions and 284 deletions

View File

@ -1311,9 +1311,9 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
int sum;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
{
register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
@ -1338,6 +1338,8 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
{ \
register vector unsigned char src1, src2, srcO; \
register vector unsigned char dst1, dst2, dstO; \
register vector signed short srcV, dstV; \
register vector signed short but0, but1, but2, op1, op2, op3; \
src1 = vec_ld(stride * i, src); \
if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
src2 = vec_ld((stride * i) + 16, src); \
@ -1348,17 +1350,19 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
/* promote the unsigned chars to signed shorts */ \
/* we're in the 8x8 function, we only care for the first 8 */ \
register vector signed short srcV = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
register vector signed short dstV = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
srcV = \
(vector signed short)vec_mergeh((vector signed char)vzero, \
(vector signed char)srcO); \
dstV = \
(vector signed short)vec_mergeh((vector signed char)vzero, \
(vector signed char)dstO); \
/* subtractions inside the first butterfly */ \
register vector signed short but0 = vec_sub(srcV, dstV); \
register vector signed short op1 = vec_perm(but0, but0, perm1); \
register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
register vector signed short op2 = vec_perm(but1, but1, perm2); \
register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
register vector signed short op3 = vec_perm(but2, but2, perm3); \
but0 = vec_sub(srcV, dstV); \
op1 = vec_perm(but0, but0, perm1); \
but1 = vec_mladd(but0, vprod1, op1); \
op2 = vec_perm(but1, but1, perm2); \
but2 = vec_mladd(but1, vprod2, op2); \
op3 = vec_perm(but2, but2, perm3); \
res = vec_mladd(but2, vprod3, op3); \
}
ONEITERBUTTERFLY(0, temp0);
@ -1481,37 +1485,63 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
#define ONEITERBUTTERFLY(i, res1, res2) \
{ \
register vector unsigned char src1 REG_v(v22), src2 REG_v(v23); \
register vector unsigned char dst1 REG_v(v24), dst2 REG_v(v25); \
register vector unsigned char src1 REG_v(v22), \
src2 REG_v(v23), \
dst1 REG_v(v24), \
dst2 REG_v(v25), \
srcO REG_v(v22), \
dstO REG_v(v23); \
\
register vector signed short srcV REG_v(v24), \
dstV REG_v(v25), \
srcW REG_v(v26), \
dstW REG_v(v27), \
but0 REG_v(v28), \
but0S REG_v(v29), \
op1 REG_v(v30), \
but1 REG_v(v22), \
op1S REG_v(v23), \
but1S REG_v(v24), \
op2 REG_v(v25), \
but2 REG_v(v26), \
op2S REG_v(v27), \
but2S REG_v(v28), \
op3 REG_v(v29), \
op3S REG_v(v30); \
\
src1 = vec_ld(stride * i, src); \
src2 = vec_ld((stride * i) + 16, src); \
register vector unsigned char srcO REG_v(v22) = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
dst1 = vec_ld(stride * i, dst); \
dst2 = vec_ld((stride * i) + 16, dst); \
register vector unsigned char dstO REG_v(v23) = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
/* promote the unsigned chars to signed shorts */ \
register vector signed short srcV REG_v(v24) = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
register vector signed short dstV REG_v(v25) = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
register vector signed short srcW REG_v(v26) = \
(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
register vector signed short dstW REG_v(v27) = \
(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
srcV = \
(vector signed short)vec_mergeh((vector signed char)vzero, \
(vector signed char)srcO); \
dstV = \
(vector signed short)vec_mergeh((vector signed char)vzero, \
(vector signed char)dstO); \
srcW = \
(vector signed short)vec_mergel((vector signed char)vzero, \
(vector signed char)srcO); \
dstW = \
(vector signed short)vec_mergel((vector signed char)vzero, \
(vector signed char)dstO); \
/* subtractions inside the first butterfly */ \
register vector signed short but0 REG_v(v28) = vec_sub(srcV, dstV); \
register vector signed short but0S REG_v(v29) = vec_sub(srcW, dstW); \
register vector signed short op1 REG_v(v30) = vec_perm(but0, but0, perm1); \
register vector signed short but1 REG_v(v22) = vec_mladd(but0, vprod1, op1); \
register vector signed short op1S REG_v(v23) = vec_perm(but0S, but0S, perm1); \
register vector signed short but1S REG_v(v24) = vec_mladd(but0S, vprod1, op1S); \
register vector signed short op2 REG_v(v25) = vec_perm(but1, but1, perm2); \
register vector signed short but2 REG_v(v26) = vec_mladd(but1, vprod2, op2); \
register vector signed short op2S REG_v(v27) = vec_perm(but1S, but1S, perm2); \
register vector signed short but2S REG_v(v28) = vec_mladd(but1S, vprod2, op2S); \
register vector signed short op3 REG_v(v29) = vec_perm(but2, but2, perm3); \
but0 = vec_sub(srcV, dstV); \
but0S = vec_sub(srcW, dstW); \
op1 = vec_perm(but0, but0, perm1); \
but1 = vec_mladd(but0, vprod1, op1); \
op1S = vec_perm(but0S, but0S, perm1); \
but1S = vec_mladd(but0S, vprod1, op1S); \
op2 = vec_perm(but1, but1, perm2); \
but2 = vec_mladd(but1, vprod2, op2); \
op2S = vec_perm(but1S, but1S, perm2); \
but2S = vec_mladd(but1S, vprod2, op2S); \
op3 = vec_perm(but2, but2, perm3); \
res1 = vec_mladd(but2, vprod3, op3); \
register vector signed short op3S REG_v(v30) = vec_perm(but2S, but2S, perm3); \
op3S = vec_perm(but2S, but2S, perm3); \
res2 = vec_mladd(but2S, vprod3, op3S); \
}
ONEITERBUTTERFLY(0, temp0, temp0S);
@ -1526,6 +1556,12 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
#undef ONEITERBUTTERFLY
{
register vector signed int vsum;
register vector signed short line0S, line1S, line2S, line3S, line4S,
line5S, line6S, line7S, line0BS,line2BS,
line1BS,line3BS,line4BS,line6BS,line5BS,
line7BS,line0CS,line4CS,line1CS,line5CS,
line2CS,line6CS,line3CS,line7CS;
register vector signed short line0 = vec_add(temp0, temp1);
register vector signed short line1 = vec_sub(temp0, temp1);
register vector signed short line2 = vec_add(temp2, temp3);
@ -1562,32 +1598,32 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
vsum = vec_sum4s(vec_abs(line6C), vsum);
vsum = vec_sum4s(vec_abs(line7C), vsum);
register vector signed short line0S = vec_add(temp0S, temp1S);
register vector signed short line1S = vec_sub(temp0S, temp1S);
register vector signed short line2S = vec_add(temp2S, temp3S);
register vector signed short line3S = vec_sub(temp2S, temp3S);
register vector signed short line4S = vec_add(temp4S, temp5S);
register vector signed short line5S = vec_sub(temp4S, temp5S);
register vector signed short line6S = vec_add(temp6S, temp7S);
register vector signed short line7S = vec_sub(temp6S, temp7S);
line0S = vec_add(temp0S, temp1S);
line1S = vec_sub(temp0S, temp1S);
line2S = vec_add(temp2S, temp3S);
line3S = vec_sub(temp2S, temp3S);
line4S = vec_add(temp4S, temp5S);
line5S = vec_sub(temp4S, temp5S);
line6S = vec_add(temp6S, temp7S);
line7S = vec_sub(temp6S, temp7S);
register vector signed short line0BS = vec_add(line0S, line2S);
register vector signed short line2BS = vec_sub(line0S, line2S);
register vector signed short line1BS = vec_add(line1S, line3S);
register vector signed short line3BS = vec_sub(line1S, line3S);
register vector signed short line4BS = vec_add(line4S, line6S);
register vector signed short line6BS = vec_sub(line4S, line6S);
register vector signed short line5BS = vec_add(line5S, line7S);
register vector signed short line7BS = vec_sub(line5S, line7S);
line0BS = vec_add(line0S, line2S);
line2BS = vec_sub(line0S, line2S);
line1BS = vec_add(line1S, line3S);
line3BS = vec_sub(line1S, line3S);
line4BS = vec_add(line4S, line6S);
line6BS = vec_sub(line4S, line6S);
line5BS = vec_add(line5S, line7S);
line7BS = vec_sub(line5S, line7S);
register vector signed short line0CS = vec_add(line0BS, line4BS);
register vector signed short line4CS = vec_sub(line0BS, line4BS);
register vector signed short line1CS = vec_add(line1BS, line5BS);
register vector signed short line5CS = vec_sub(line1BS, line5BS);
register vector signed short line2CS = vec_add(line2BS, line6BS);
register vector signed short line6CS = vec_sub(line2BS, line6BS);
register vector signed short line3CS = vec_add(line3BS, line7BS);
register vector signed short line7CS = vec_sub(line3BS, line7BS);
line0CS = vec_add(line0BS, line4BS);
line4CS = vec_sub(line0BS, line4BS);
line1CS = vec_add(line1BS, line5BS);
line5CS = vec_sub(line1BS, line5BS);
line2CS = vec_add(line2BS, line6BS);
line6CS = vec_sub(line2BS, line6BS);
line3CS = vec_add(line3BS, line7BS);
line7CS = vec_sub(line3BS, line7BS);
vsum = vec_sum4s(vec_abs(line0CS), vsum);
vsum = vec_sum4s(vec_abs(line1CS), vsum);

View File

@ -19,13 +19,13 @@
/* this code assumes that stride % 16 == 0 */
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
signed int ABCD[4] __attribute__((aligned(16)));
signed int ABCD[4] __attribute__((aligned(16))) =
{((8 - x) * (8 - y)),
((x) * (8 - y)),
((8 - x) * (y)),
((x) * (y))};
register int i;
ABCD[0] = ((8 - x) * (8 - y));
ABCD[1] = ((x) * (8 - y));
ABCD[2] = ((8 - x) * (y));
ABCD[3] = ((x) * (y));
vector unsigned char fperm;
const vector signed int vABCD = vec_ld(0, ABCD);
const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
@ -34,55 +34,61 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
const vector signed int vzero = vec_splat_s32(0);
const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
const vector unsigned short v6us = vec_splat_u16(6);
vector unsigned char fperm;
if (((unsigned long)dst) % 16 == 0) {
fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
} else {
fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
}
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vector unsigned char vsrcAuc;
vector unsigned char vsrcBuc;
vector unsigned char vsrcperm0;
vector unsigned char vsrcperm1;
vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vector unsigned char vsrc0uc, vsrc1uc;
vector signed short vsrc0ssH, vsrc1ssH;
vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
vector signed short vsrc2ssH, vsrc3ssH, psum;
vector unsigned char vdst, ppsum, vfdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
if (((unsigned long)dst) % 16 == 0) {
fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F);
} else {
fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F);
}
vsrcAuc = vec_ld(0, src);
if (loadSecond)
vsrcBuc = vec_ld(16, src);
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
vector unsigned char vsrc0uc;
vector unsigned char vsrc1uc;
vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
if (reallyBadAlign)
vsrc1uc = vsrcBuc;
else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc);
vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc);
vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc0uc);
vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc1uc);
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vector unsigned char vsrcCuc;
vsrcCuc = vec_ld(stride + 0, src);
vector unsigned char vsrc2uc;
vector unsigned char vsrc3uc;
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
vector signed short psum;
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc2uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum);
@ -91,11 +97,9 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
psum = vec_add(v32ss, psum);
psum = vec_sra(psum, v6us);
vector unsigned char vdst = vec_ld(0, dst);
vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum);
vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
vector unsigned char fsum;
vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_packsu(psum, psum);
vfdst = vec_perm(vdst, ppsum, fperm);
OP_U8_ALTIVEC(fsum, vfdst, vdst);
@ -108,24 +112,21 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
src += stride;
}
} else {
for (i = 0 ; i < h ; i++) {
vector unsigned char vsrcCuc;
vector unsigned char vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src);
vector unsigned char vsrc2uc;
vector unsigned char vsrc3uc;
vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
if (reallyBadAlign)
vsrc3uc = vsrcDuc;
else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
vector signed short psum;
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc2uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum);
@ -134,11 +135,9 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
psum = vec_add(v32ss, psum);
psum = vec_sr(psum, v6us);
vector unsigned char vdst = vec_ld(0, dst);
vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum);
vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
vector unsigned char fsum;
vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_pack(psum, psum);
vfdst = vec_perm(vdst, ppsum, fperm);
OP_U8_ALTIVEC(fsum, vfdst, vdst);
@ -157,7 +156,6 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
register int i;
const vector signed int vzero = vec_splat_s32(0);
@ -172,13 +170,30 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
const vector unsigned char dstperm = vec_lvsr(0, dst);
const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
const vector unsigned char neg1 =
(const vector unsigned char) vec_splat_s8(-1);
const vector unsigned char dstmask =
vec_perm((const vector unsigned char)vzero,
neg1, dstperm);
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
register int align = ((((unsigned long)src) - 2) % 16);
vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB;
vector unsigned char sum, dst1, dst2, vdst, fsum,
rsum, fdst1, fdst2;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
for (i = 0 ; i < 16 ; i ++) {
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vector unsigned char srcR1 = vec_ld(-2, src);
vector unsigned char srcR2 = vec_ld(14, src);
@ -237,55 +252,54 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
} break;
}
const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
srcP0A = vec_mergeh((vector unsigned char)vzero, srcP0);
srcP0B = vec_mergel((vector unsigned char)vzero, srcP0);
srcP1A = vec_mergeh((vector unsigned char)vzero, srcP1);
srcP1B = vec_mergel((vector unsigned char)vzero, srcP1);
const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
srcP2A = vec_mergeh((vector unsigned char)vzero, srcP2);
srcP2B = vec_mergel((vector unsigned char)vzero, srcP2);
srcP3A = vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3B = vec_mergel((vector unsigned char)vzero, srcP3);
const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
srcM1A = vec_mergeh((vector unsigned char)vzero, srcM1);
srcM1B = vec_mergel((vector unsigned char)vzero, srcM1);
srcM2A = vec_mergeh((vector unsigned char)vzero, srcM2);
srcM2B = vec_mergel((vector unsigned char)vzero, srcM2);
const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
const vector signed short pp3A = vec_add(sum3A, pp1A);
const vector signed short pp3B = vec_add(sum3B, pp1B);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
const vector signed short psumA = vec_sub(pp3A, pp2A);
const vector signed short psumB = vec_sub(pp3B, pp2B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
const vector signed short sumA = vec_sra(psumA, v5us);
const vector signed short sumB = vec_sra(psumB, v5us);
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
const vector unsigned char sum = vec_packsu(sumA, sumB);
sum = vec_packsu(sumA, sumB);
const vector unsigned char dst1 = vec_ld(0, dst);
const vector unsigned char dst2 = vec_ld(16, dst);
const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
dst1 = vec_ld(0, dst);
dst2 = vec_ld(16, dst);
vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
vector unsigned char fsum;
OP_U8_ALTIVEC(fsum, sum, vdst);
const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
rsum = vec_perm(fsum, fsum, dstperm);
fdst1 = vec_sel(dst1, rsum, dstmask);
fdst2 = vec_sel(rsum, dst2, dstmask);
vec_st(fdst1, 0, dst);
vec_st(fdst2, 16, dst);
@ -299,7 +313,6 @@ POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
register int i;
@ -318,49 +331,71 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
const vector unsigned char srcM2a = vec_ld(0, srcbis);
const vector unsigned char srcM2b = vec_ld(16, srcbis);
const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
srcbis += srcStride;
const vector unsigned char srcM1a = vec_ld(0, srcbis);
// srcbis += srcStride;
const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcM1b = vec_ld(16, srcbis);
const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
srcbis += srcStride;
const vector unsigned char srcP0a = vec_ld(0, srcbis);
// srcbis += srcStride;
const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP0b = vec_ld(16, srcbis);
const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
srcbis += srcStride;
const vector unsigned char srcP1a = vec_ld(0, srcbis);
// srcbis += srcStride;
const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP1b = vec_ld(16, srcbis);
const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
srcbis += srcStride;
const vector unsigned char srcP2a = vec_ld(0, srcbis);
// srcbis += srcStride;
const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP2b = vec_ld(16, srcbis);
const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
srcbis += srcStride;
// srcbis += srcStride;
vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
vector signed short srcM2ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM2);
vector signed short srcM2ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM2);
vector signed short srcM1ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM1);
vector signed short srcM1ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM1);
vector signed short srcP0ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP0);
vector signed short srcP0ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP0);
vector signed short srcP1ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1);
vector signed short srcP1ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP1);
vector signed short srcP2ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2);
vector signed short srcP2ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP2);
vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
srcP3a, srcP3b, srcP3;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
for (i = 0 ; i < 16 ; i++) {
const vector unsigned char srcP3a = vec_ld(0, srcbis);
const vector unsigned char srcP3b = vec_ld(16, srcbis);
const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
srcbis += srcStride;
srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm);
srcP3ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);
// srcbis += srcStride;
const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);
sum1A = vec_adds(srcP0ssA, srcP1ssA);
sum1B = vec_adds(srcP0ssB, srcP1ssB);
sum2A = vec_adds(srcM1ssA, srcP2ssA);
sum2B = vec_adds(srcM1ssB, srcP2ssB);
sum3A = vec_adds(srcM2ssA, srcP3ssA);
sum3B = vec_adds(srcM2ssB, srcP3ssB);
srcM2ssA = srcM1ssA;
srcM2ssB = srcM1ssB;
@ -373,33 +408,32 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP2ssA = srcP3ssA;
srcP2ssB = srcP3ssB;
const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
const vector signed short pp3A = vec_add(sum3A, pp1A);
const vector signed short pp3B = vec_add(sum3B, pp1B);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
const vector signed short psumA = vec_sub(pp3A, pp2A);
const vector signed short psumB = vec_sub(pp3B, pp2B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
const vector signed short sumA = vec_sra(psumA, v5us);
const vector signed short sumB = vec_sra(psumB, v5us);
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
const vector unsigned char sum = vec_packsu(sumA, sumB);
sum = vec_packsu(sumA, sumB);
const vector unsigned char dst1 = vec_ld(0, dst);
const vector unsigned char dst2 = vec_ld(16, dst);
const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
dst1 = vec_ld(0, dst);
dst2 = vec_ld(16, dst);
vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
vector unsigned char fsum;
OP_U8_ALTIVEC(fsum, sum, vdst);
const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
rsum = vec_perm(fsum, fsum, dstperm);
fdst1 = vec_sel(dst1, rsum, dstmask);
fdst2 = vec_sel(rsum, dst2, dstmask);
vec_st(fdst1, 0, dst);
vec_st(fdst2, 16, dst);
@ -412,7 +446,6 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
register int i;
const vector signed int vzero = vec_splat_s32(0);
const vector unsigned char permM2 = vec_lvsl(-2, src);
@ -430,8 +463,38 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
register int align = ((((unsigned long)src) - 2) % 16);
src -= (2 * srcStride);
const vector unsigned char neg1 = (const vector unsigned char)
vec_splat_s8(-1);
vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
const vector unsigned char dstperm = vec_lvsr(0, dst);
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
const vector unsigned char mperm = (const vector unsigned char)
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
int16_t *tmpbis = tmp;
vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB;
vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo;
vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
rsum, fdst1, fdst2;
vector signed short ssume, ssumo;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) {
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vector unsigned char srcR1 = vec_ld(-2, src);
@ -492,36 +555,48 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
} break;
}
const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
srcP0A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP0);
srcP0B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP0);
srcP1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1);
srcP1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP1);
const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
srcP2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2);
srcP2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP2);
srcP3A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);
const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
srcM1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM1);
srcM1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM1);
srcM2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM2);
srcM2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM2);
const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);
pp1A = vec_mladd(sum1A, v20ss, sum3A);
pp1B = vec_mladd(sum1B, v20ss, sum3B);
const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
const vector signed short psumA = vec_sub(pp1A, pp2A);
const vector signed short psumB = vec_sub(pp1B, pp2B);
psumA = vec_sub(pp1A, pp2A);
psumB = vec_sub(pp1B, pp2B);
vec_st(psumA, 0, tmp);
vec_st(psumB, 16, tmp);
@ -530,35 +605,25 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
}
const vector unsigned char dstperm = vec_lvsr(0, dst);
const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
const vector unsigned char mperm = (const vector unsigned char)
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
int16_t *tmpbis = tmp - (tmpStride * 21);
vector signed short tmpM2ssA = vec_ld(0, tmpbis);
vector signed short tmpM2ssB = vec_ld(16, tmpbis);
tmpM2ssA = vec_ld(0, tmpbis);
tmpM2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
vector signed short tmpM1ssA = vec_ld(0, tmpbis);
vector signed short tmpM1ssB = vec_ld(16, tmpbis);
tmpM1ssA = vec_ld(0, tmpbis);
tmpM1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
vector signed short tmpP0ssA = vec_ld(0, tmpbis);
vector signed short tmpP0ssB = vec_ld(16, tmpbis);
tmpP0ssA = vec_ld(0, tmpbis);
tmpP0ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
vector signed short tmpP1ssA = vec_ld(0, tmpbis);
vector signed short tmpP1ssB = vec_ld(16, tmpbis);
tmpP1ssA = vec_ld(0, tmpbis);
tmpP1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
vector signed short tmpP2ssA = vec_ld(0, tmpbis);
vector signed short tmpP2ssB = vec_ld(16, tmpbis);
tmpP2ssA = vec_ld(0, tmpbis);
tmpP2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) {
const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
@ -567,6 +632,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride;
tmpM2ssA = tmpM1ssA;
tmpM2ssB = tmpM1ssB;
tmpM1ssA = tmpP0ssA;
@ -578,57 +645,56 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
tmpP2ssA = tmpP3ssA;
tmpP2ssB = tmpP3ssB;
const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
const vector signed int pp1Be = vec_mule(sum1B, v20ss);
const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);
pp1Ae = vec_mule(sum1A, v20ss);
pp1Ao = vec_mulo(sum1A, v20ss);
pp1Be = vec_mule(sum1B, v20ss);
pp1Bo = vec_mulo(sum1B, v20ss);
const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
const vector signed int pp2Be = vec_mule(sum2B, v5ss);
const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);
pp2Ae = vec_mule(sum2A, v5ss);
pp2Ao = vec_mulo(sum2A, v5ss);
pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss);
const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);
pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss);
pp3Be = vec_sra((vector signed int)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss);
const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
const vector signed int pp1cBe = vec_add(pp1Be, v512si);
const vector signed int pp1cBo = vec_add(pp1Bo, v512si);
pp1cAe = vec_add(pp1Ae, v512si);
pp1cAo = vec_add(pp1Ao, v512si);
pp1cBe = vec_add(pp1Be, v512si);
pp1cBo = vec_add(pp1Bo, v512si);
const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);
pp32Ae = vec_sub(pp3Ae, pp2Ae);
pp32Ao = vec_sub(pp3Ao, pp2Ao);
pp32Be = vec_sub(pp3Be, pp2Be);
pp32Bo = vec_sub(pp3Bo, pp2Bo);
const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);
sumAe = vec_add(pp1cAe, pp32Ae);
sumAo = vec_add(pp1cAo, pp32Ao);
sumBe = vec_add(pp1cBe, pp32Be);
sumBo = vec_add(pp1cBo, pp32Bo);
const vector signed int ssumAe = vec_sra(sumAe, v10ui);
const vector signed int ssumAo = vec_sra(sumAo, v10ui);
const vector signed int ssumBe = vec_sra(sumBe, v10ui);
const vector signed int ssumBo = vec_sra(sumBo, v10ui);
ssumAe = vec_sra(sumAe, v10ui);
ssumAo = vec_sra(sumAo, v10ui);
ssumBe = vec_sra(sumBe, v10ui);
ssumBo = vec_sra(sumBo, v10ui);
const vector signed short ssume = vec_packs(ssumAe, ssumBe);
const vector signed short ssumo = vec_packs(ssumAo, ssumBo);
ssume = vec_packs(ssumAe, ssumBe);
ssumo = vec_packs(ssumAo, ssumBo);
const vector unsigned char sumv = vec_packsu(ssume, ssumo);
const vector unsigned char sum = vec_perm(sumv, sumv, mperm);
sumv = vec_packsu(ssume, ssumo);
sum = vec_perm(sumv, sumv, mperm);
const vector unsigned char dst1 = vec_ld(0, dst);
const vector unsigned char dst2 = vec_ld(16, dst);
const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
dst1 = vec_ld(0, dst);
dst2 = vec_ld(16, dst);
vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
vector unsigned char fsum;
OP_U8_ALTIVEC(fsum, sum, vdst);
const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
rsum = vec_perm(fsum, fsum, dstperm);
fdst1 = vec_sel(dst1, rsum, dstmask);
fdst2 = vec_sel(rsum, dst2, dstmask);
vec_st(fdst1, 0, dst);
vec_st(fdst2, 16, dst);