Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>

Originally committed as revision 274 to svn://svn.ffmpeg.org/ffmpeg/trunk
Nick Kurshev 2002-01-20 14:48:02 +00:00
parent 4bdd9157cc
commit 1e98dffb7a
9 changed files with 769 additions and 2 deletions

libavcodec/Makefile

@@ -37,6 +37,12 @@ OBJS += mlib/dsputil_mlib.o
CFLAGS += $(MLIB_INC)
endif
# alpha specific stuff
ifeq ($(TARGET_ARCH_ALPHA),yes)
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o
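# -Wa,-mpca56 passes -mpca56 to the assembler so it accepts the PCA56/MVI
# instructions emitted by the inline asm in the alpha/ sources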
CFLAGS += -Wa,-mpca56
endif
SRCS = $(OBJS:.o=.c) $(ASM_OBJS:.o=.s)
LIB= libavcodec.a
@@ -74,6 +80,7 @@ clean:
rm -f *.o *~ $(LIB) $(SLIB) *.so i386/*.o i386/*~ \
armv4l/*.o armv4l/*~ \
mlib/*.o mlib/*~ \
alpha/*.o alpha/*~ \
libac3/*.o libac3/*~ \
apiexample $(TESTS)

libavcodec/alpha/asm.h (new file)

@@ -0,0 +1,141 @@
/*
* Alpha optimized DSP utils
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef LIBAVCODEC_ALPHA_ASM_H
#define LIBAVCODEC_ALPHA_ASM_H
#include <stdint.h>
#define AMASK_BWX (1 << 0)
#define AMASK_FIX (1 << 1)
#define AMASK_MVI (1 << 8)
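/* amask returns its argument with the bits of all implemented
instruction-set extensions cleared, so e.g. amask(AMASK_MVI) == 0
means the motion-video extension is available. */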
static inline uint64_t BYTE_VEC(uint64_t x)
{
x |= x << 8;
x |= x << 16;
x |= x << 32;
return x;
}
static inline uint64_t WORD_VEC(uint64_t x)
{
x |= x << 16;
x |= x << 32;
return x;
}
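/* Each step doubles the populated width, e.g.
BYTE_VEC(0x01): 0x01 -> 0x0101 -> 0x01010101 -> 0x0101010101010101
WORD_VEC(0x00ff): 0x00ff -> 0x00ff00ff -> 0x00ff00ff00ff00ff
broadcasting one byte/word pattern across all eight/four lanes. */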
static inline int32_t ldl(const void* p)
{
return *(const int32_t*) p;
}
static inline uint64_t ldq(const void* p)
{
return *(const uint64_t*) p;
}
/* FIXME ccc doesn't seem to get it? Use inline asm? */
static inline uint64_t ldq_u(const void* p)
{
return *(const uint64_t*) ((uintptr_t) p & ~7ul);
}
static inline void stl(uint32_t l, void* p)
{
*(uint32_t*) p = l;
}
static inline void stq(uint64_t l, void* p)
{
*(uint64_t*) p = l;
}
#ifdef __GNUC__
#define OPCODE1(name) \
static inline uint64_t name(uint64_t l) \
{ \
uint64_t r; \
asm (#name " %1, %0" : "=r" (r) : "r" (l)); \
return r; \
}
#define OPCODE2(name) \
static inline uint64_t name(uint64_t l1, uint64_t l2) \
{ \
uint64_t r; \
asm (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2)); \
return r; \
}
/* We don't want gcc to move this around or combine it with another
rpcc, so mark it volatile. */
static inline uint64_t rpcc(void)
{
uint64_t r;
asm volatile ("rpcc %0" : "=r" (r));
return r;
}
static inline uint64_t uldq(const void* v)
{
struct foo {
unsigned long l;
} __attribute__((packed));
return ((const struct foo*) v)->l;
}
#elif defined(__DECC) /* Compaq "ccc" compiler */
#include <c_asm.h>
#define OPCODE1(name) \
static inline uint64_t name(uint64_t l) \
{ \
return asm (#name " %a0, %v0", l); \
}
#define OPCODE2(name) \
static inline uint64_t name(uint64_t l1, uint64_t l2) \
{ \
return asm (#name " %a0, %a1, %v0", l1, l2); \
}
static inline uint64_t rpcc(void)
{
return asm ("rpcc %v0");
}
static inline uint64_t uldq(const void* v)
{
return *(const __unaligned uint64_t *) v;
}
#endif
OPCODE1(amask);
OPCODE1(unpkbw);
OPCODE1(pkwb);
OPCODE2(extql);
OPCODE2(extqh);
OPCODE2(zap);
OPCODE2(cmpbge);
OPCODE2(minsw4);
OPCODE2(minuw4);
OPCODE2(minub8);
OPCODE2(maxsw4);
OPCODE2(maxuw4);
OPCODE2(perr);
#endif /* LIBAVCODEC_ALPHA_ASM_H */

libavcodec/alpha/dsputil_alpha.c (new file)

@@ -0,0 +1,223 @@
/*
* Alpha optimized DSP utils
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "asm.h"
#include "../dsputil.h"
void simple_idct_axp(DCTELEM *block);
static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
int line_size)
{
int i = 8;
do {
UINT64 shorts;
shorts = ldq(block);
shorts = maxsw4(shorts, 0);
shorts = minsw4(shorts, WORD_VEC(0x00ff));
stl(pkwb(shorts), pixels);
shorts = ldq(block + 4);
shorts = maxsw4(shorts, 0);
shorts = minsw4(shorts, WORD_VEC(0x00ff));
stl(pkwb(shorts), pixels + 4);
pixels += line_size;
block += 8;
} while (--i);
}
static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
int line_size)
{
int i = 8;
do {
UINT64 shorts;
shorts = ldq(block);
shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
shorts += unpkbw(ldl(pixels));
shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e.g. -2 + 3 */
shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
stl(pkwb(shorts), pixels);
/* next 4 */
shorts = ldq(block + 4);
shorts &= ~WORD_VEC(0x8000);
shorts += unpkbw(ldl(pixels + 4));
shorts &= ~WORD_VEC(0x8000);
shorts = minuw4(shorts, WORD_VEC(0x4000));
shorts &= ~WORD_VEC(0x4000);
shorts = minsw4(shorts, WORD_VEC(0x00ff));
stl(pkwb(shorts), pixels + 4);
pixels += line_size;
block += 8;
} while (--i);
}
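/* Worked example of the clamping trick, one 16-bit lane at a time:
block = -2, pixel = 3: 0xfffe & ~0x8000 = 0x7ffe; + 3 = 0x8001;
& ~0x8000 = 0x0001, the correct sum 1.
block = -10, pixel = 3: 0xfff6 -> 0x7ff6; + 3 = 0x7ff9; minuw4 against
0x4000 yields 0x4000, and clearing that bit leaves 0, so negative
results clamp to 0.
block = 300, pixel = 200: the sum 500 stays below 0x4000 and is
clamped to 255 by the final minsw4. */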
/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
Since the immediate result could be greater than 255, we do the
shift first. The result is too low by one if the bytes were both
odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
{
UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
return l1 + l2 + correction;
}
/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
The '1' only has an effect when one byte is even and the other odd,
i.e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
static inline UINT64 avg2(UINT64 l1, UINT64 l2)
{
UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
return l1 + l2 + correction;
}
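/* Worked example: for bytes 3 and 5 (both odd) the shifted halves give
1 + 2 = 3 and the correction (3 & 5) & 1 = 1 restores the exact
(3 + 5) >> 1 = 4. For 3 and 4, avg2_no_rnd gives 1 + 2 + 0 = 3 =
(3 + 4) >> 1, while avg2 adds (3 | 4) & 1 = 1 and rounds up to
4 = (3 + 4 + 1) >> 1. */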
static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
{
UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+ ((l2 & ~BYTE_VEC(0x03)) >> 2)
+ ((l3 & ~BYTE_VEC(0x03)) >> 2)
+ ((l4 & ~BYTE_VEC(0x03)) >> 2);
UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
+ (l2 & BYTE_VEC(0x03))
+ (l3 & BYTE_VEC(0x03))
+ (l4 & BYTE_VEC(0x03))
+ BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
return r1 + r2;
}
static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
{
UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+ ((l2 & ~BYTE_VEC(0x03)) >> 2)
+ ((l3 & ~BYTE_VEC(0x03)) >> 2)
+ ((l4 & ~BYTE_VEC(0x03)) >> 2);
UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
+ (l2 & BYTE_VEC(0x03))
+ (l3 & BYTE_VEC(0x03))
+ (l4 & BYTE_VEC(0x03))
+ BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
return r1 + r2;
}
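/* Both variants compute (b1 + b2 + b3 + b4 + rnd) >> 2 per byte without
cross-byte carries: the high six bits are averaged exactly in r1 (four
addends of at most 0x3f per byte), while the low two bits are summed in
r2 (at most 4*3 + 2 = 14, still within one byte). E.g. bytes 1, 2, 3, 4
with rounding: r1 = 0 + 0 + 0 + 1 = 1, r2 = ((1 + 2 + 3 + 0) + 2) >> 2 = 2,
giving 3 = (1 + 2 + 3 + 4 + 2) >> 2. */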
#define PIXOPNAME(suffix) put ## suffix
#define BTYPE UINT8
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(l, b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE
#define PIXOPNAME(suffix) put_no_rnd ## suffix
#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(l, b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE
/* The following functions are untested. */
#if 0
#define PIXOPNAME(suffix) avg ## suffix
#define BTYPE UINT8
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE
#define PIXOPNAME(suffix) avg_no_rnd ## suffix
#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE
#define PIXOPNAME(suffix) sub ## suffix
#define BTYPE DCTELEM
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, block) do { \
UINT64 xxx = l; \
(block)[0] -= (xxx >> 0) & 0xff; \
(block)[1] -= (xxx >> 8) & 0xff; \
(block)[2] -= (xxx >> 16) & 0xff; \
(block)[3] -= (xxx >> 24) & 0xff; \
(block)[4] -= (xxx >> 32) & 0xff; \
(block)[5] -= (xxx >> 40) & 0xff; \
(block)[6] -= (xxx >> 48) & 0xff; \
(block)[7] -= (xxx >> 56) & 0xff; \
} while (0)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE
#endif
void dsputil_init_alpha(void)
{
put_pixels_tab[0] = put_pixels_axp;
put_pixels_tab[1] = put_pixels_x2_axp;
put_pixels_tab[2] = put_pixels_y2_axp;
put_pixels_tab[3] = put_pixels_xy2_axp;
put_no_rnd_pixels_tab[0] = put_pixels_axp;
put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
/* amask clears all bits that correspond to present features. */
if (amask(AMASK_MVI) == 0) {
fprintf(stderr, "MVI extension detected\n");
put_pixels_clamped = put_pixels_clamped_axp;
add_pixels_clamped = add_pixels_clamped_axp;
}
}

libavcodec/alpha/mpegvideo_alpha.c (new file)

@@ -0,0 +1,88 @@
/*
* Alpha optimized DSP utils
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "asm.h"
#include "../dsputil.h"
#include "../mpegvideo.h"
extern UINT8 zigzag_end[64];
static void dct_unquantize_h263_axp(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int i, level;
UINT64 qmul, qadd;
if (s->mb_intra) {
if (n < 4)
block[0] = block[0] * s->y_dc_scale;
else
block[0] = block[0] * s->c_dc_scale;
/* Catch up to aligned point. */
qmul = s->qscale << 1;
qadd = (s->qscale - 1) | 1;
for (i = 1; i < 4; ++i) {
level = block[i];
if (level) {
if (level < 0) {
level = level * qmul - qadd;
} else {
level = level * qmul + qadd;
}
block[i] = level;
}
}
block += 4;
i = 60 / 4;
} else {
i = zigzag_end[s->block_last_index[n]] / 4;
}
qmul = s->qscale << 1;
qadd = WORD_VEC((qscale - 1) | 1);
do {
UINT64 levels, negmask, zeromask, corr;
levels = ldq(block);
if (levels == 0)
continue;
zeromask = cmpbge(0, levels);
zeromask &= zeromask >> 1;
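/* cmpbge(0, x) sets result bit i iff byte i of x equals zero. And-ing
the mask with itself shifted right makes bit 2i reflect "word i is
entirely zero", which is the byte that zap() below must clear from
qadd; the high byte of each qadd word is zero anyway, so the odd bits
are harmless. */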
/* Negate all negative words. */
negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */
negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */
levels ^= negmask;
levels += corr;
levels = levels * qmul;
levels += zap(qadd, zeromask);
/* Re-negate negative words. */
levels -= corr;
levels ^= negmask;
stq(levels, block);
} while (block += 4, --i);
}
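/* Worked example for one lane: level = -3, qscale = 5 gives qmul = 10,
qadd = (5-1)|1 = 5. The word 0xfffd yields negmask = 0xffff, corr = 1;
xor and add produce |level| = 3; 3*10 + 5 = 35; subtracting corr and
xor-ing with negmask re-negates to 0xffdd = -35, i.e.
-(|level| * qmul + qadd), as required for negative coefficients. */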
void MPV_common_init_axp(MpegEncContext *s)
{
if (amask(AMASK_MVI) == 0) {
if (s->out_format == FMT_H263)
s->dct_unquantize = dct_unquantize_h263_axp;
}
}

libavcodec/alpha/pixops.h (new file)

@@ -0,0 +1,135 @@
/*
* Alpha optimized DSP utils
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* This file is intended to be #included with proper definitions of
* PIXOPNAME, BTYPE, AVG2, AVG4 and STORE. */
static void PIXOPNAME(_pixels_axp)(BTYPE *block, const UINT8 *pixels,
int line_size, int h)
{
if ((size_t) pixels & 0x7) {
do {
STORE(uldq(pixels), block);
pixels += line_size;
block += line_size;
} while (--h);
} else {
do {
STORE(ldq(pixels), block);
pixels += line_size;
block += line_size;
} while (--h);
}
}
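/* In the x2 (horizontal half-pel) cases below, pix1 holds pixels[0..7].
Alpha is little-endian, so pix1 >> 8 drops pixels[0] and or-ing in
pixels[8] << 56 forms pixels[1..8]; AVG2 thus averages each pixel with
its right-hand neighbour. */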
static void PIXOPNAME(_pixels_x2_axp)(BTYPE *block, const UINT8 *pixels,
int line_size, int h)
{
if ((size_t) pixels & 0x7) {
do {
UINT64 pix1, pix2;
pix1 = uldq(pixels);
pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
STORE(AVG2(pix1, pix2), block);
pixels += line_size;
block += line_size;
} while (--h);
} else {
do {
UINT64 pix1, pix2;
pix1 = ldq(pixels);
pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
STORE(AVG2(pix1, pix2), block);
pixels += line_size;
block += line_size;
} while (--h);
}
}
static void PIXOPNAME(_pixels_y2_axp)(BTYPE *block, const UINT8 *pixels,
int line_size, int h)
{
if ((size_t) pixels & 0x7) {
UINT64 pix = uldq(pixels);
do {
UINT64 next_pix;
pixels += line_size;
next_pix = uldq(pixels);
STORE(AVG2(pix, next_pix), block);
block += line_size;
pix = next_pix;
} while (--h);
} else {
UINT64 pix = ldq(pixels);
do {
UINT64 next_pix;
pixels += line_size;
next_pix = ldq(pixels);
STORE(AVG2(pix, next_pix), block);
block += line_size;
pix = next_pix;
} while (--h);
}
}
/* This could be further sped up by recycling AVG4 intermediate
results from the previous loop pass. */
static void PIXOPNAME(_pixels_xy2_axp)(BTYPE *block, const UINT8 *pixels,
int line_size, int h)
{
if ((size_t) pixels & 0x7) {
UINT64 pix1 = uldq(pixels);
UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
do {
UINT64 next_pix1, next_pix2;
pixels += line_size;
next_pix1 = uldq(pixels);
next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56);
STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
block += line_size;
pix1 = next_pix1;
pix2 = next_pix2;
} while (--h);
} else {
UINT64 pix1 = ldq(pixels);
UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
do {
UINT64 next_pix1, next_pix2;
pixels += line_size;
next_pix1 = ldq(pixels);
next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56);
STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
block += line_size;
pix1 = next_pix1;
pix2 = next_pix2;
} while (--h);
}
}

libavcodec/dsputil.c

@@ -497,6 +497,10 @@ void dsputil_init(void)
dsputil_init_mlib();
use_permuted_idct = 0;
#endif
#ifdef ARCH_ALPHA
dsputil_init_alpha();
use_permuted_idct = 0;
#endif
#ifdef SIMPLE_IDCT
if(ff_idct == simple_idct) use_permuted_idct=0;

libavcodec/dsputil.h

@@ -123,6 +123,13 @@ void dsputil_init_armv4l(void);
void dsputil_init_mlib(void);
#elif defined(ARCH_ALPHA)
#define emms_c()
#define __align8 __attribute__ ((aligned (8)))
void dsputil_init_alpha(void);
#else
#define emms_c()

libavcodec/msmpeg4.c

@@ -460,7 +460,19 @@ static int msmpeg4_pred_dc(MpegEncContext * s, int n,
: "r" (scale)
: "%eax", "%edx"
);
#else
#elif defined (ARCH_ALPHA)
/* Divisions are extremely costly on Alpha; optimize the most
common case. */
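/* A constant divisor lets the compiler strength-reduce the division to
a short shift/add sequence; Alpha has no integer divide instruction,
so a variable divisor costs a full software division. */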
if (scale == 8) {
a = (a + (8 >> 1)) / 8;
b = (b + (8 >> 1)) / 8;
c = (c + (8 >> 1)) / 8;
} else {
a = (a + (scale >> 1)) / scale;
b = (b + (scale >> 1)) / scale;
c = (c + (scale >> 1)) / scale;
}
#else
a = (a + (scale >> 1)) / scale;
b = (b + (scale >> 1)) / scale;
c = (c + (scale >> 1)) / scale;

libavcodec/simple_idct.c

@@ -23,6 +23,7 @@
#include <inttypes.h>
#include "simple_idct.h"
#include "../config.h"
#if 0
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
@@ -102,6 +103,107 @@ static int inline idctRowCondZ (int16_t * row)
return 1;
}
#ifdef ARCH_ALPHA
static int inline idctRowCondDC(int16_t *row)
{
int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
uint64_t *lrow = (uint64_t *) row;
if (lrow[1] == 0) {
if (lrow[0] == 0)
return 0;
if ((lrow[0] & ~0xffffULL) == 0) {
uint64_t v;
a0 = W4 * row[0];
a0 += 1 << (ROW_SHIFT - 1);
a0 >>= ROW_SHIFT;
v = (uint16_t) a0;
v += v << 16;
v += v << 32;
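/* v now holds the 16-bit DC result in all four word lanes; both
quadwords of the row receive the same pattern. */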
lrow[0] = v;
lrow[1] = v;
return 1;
}
}
a0 = W4 * row[0];
a1 = W4 * row[0];
a2 = W4 * row[0];
a3 = W4 * row[0];
if (row[2]) {
a0 += W2 * row[2];
a1 += W6 * row[2];
a2 -= W6 * row[2];
a3 -= W2 * row[2];
}
if (row[4]) {
a0 += W4 * row[4];
a1 -= W4 * row[4];
a2 -= W4 * row[4];
a3 += W4 * row[4];
}
if (row[6]) {
a0 += W6 * row[6];
a1 -= W2 * row[6];
a2 += W2 * row[6];
a3 -= W6 * row[6];
}
a0 += 1 << (ROW_SHIFT - 1);
a1 += 1 << (ROW_SHIFT - 1);
a2 += 1 << (ROW_SHIFT - 1);
a3 += 1 << (ROW_SHIFT - 1);
if (row[1]) {
b0 = W1 * row[1];
b1 = W3 * row[1];
b2 = W5 * row[1];
b3 = W7 * row[1];
} else {
b0 = 0;
b1 = 0;
b2 = 0;
b3 = 0;
}
if (row[3]) {
b0 += W3 * row[3];
b1 -= W7 * row[3];
b2 -= W1 * row[3];
b3 -= W5 * row[3];
}
if (row[5]) {
b0 += W5 * row[5];
b1 -= W1 * row[5];
b2 += W7 * row[5];
b3 += W3 * row[5];
}
if (row[7]) {
b0 += W7 * row[7];
b1 -= W5 * row[7];
b2 += W3 * row[7];
b3 -= W1 * row[7];
}
row[0] = (a0 + b0) >> ROW_SHIFT;
row[1] = (a1 + b1) >> ROW_SHIFT;
row[2] = (a2 + b2) >> ROW_SHIFT;
row[3] = (a3 + b3) >> ROW_SHIFT;
row[4] = (a3 - b3) >> ROW_SHIFT;
row[5] = (a2 - b2) >> ROW_SHIFT;
row[6] = (a1 - b1) >> ROW_SHIFT;
row[7] = (a0 - b0) >> ROW_SHIFT;
return 1;
}
#else /* not ARCH_ALPHA */
static int inline idctRowCondDC (int16_t * row)
{
int a0, a1, a2, a3, b0, b1, b2, b3;
@@ -147,6 +249,7 @@ static int inline idctRowCondDC (int16_t * row)
return 1;
}
#endif /* not ARCH_ALPHA */
static void inline idctCol (int16_t * col)
{
@@ -243,6 +346,7 @@ static void inline idctSparseCol (int16_t * col)
b3 += - W1*col[8*7];
}
#ifndef ARCH_ALPHA
if(!(b0|b1|b2|b3)){
col[8*0] = (a0) >> COL_SHIFT;
col[8*7] = (a0) >> COL_SHIFT;
@@ -253,6 +357,7 @@ static void inline idctSparseCol (int16_t * col)
col[8*3] = (a3) >> COL_SHIFT;
col[8*4] = (a3) >> COL_SHIFT;
}else{
#endif
col[8*0] = (a0 + b0) >> COL_SHIFT;
col[8*7] = (a0 - b0) >> COL_SHIFT;
col[8*1] = (a1 + b1) >> COL_SHIFT;
@@ -261,7 +366,9 @@ static void inline idctSparseCol (int16_t * col)
col[8*5] = (a2 - b2) >> COL_SHIFT;
col[8*3] = (a3 + b3) >> COL_SHIFT;
col[8*4] = (a3 - b3) >> COL_SHIFT;
#ifndef ARCH_ALPHA
}
#endif
}
static void inline idctSparse2Col (int16_t * col)
@@ -337,6 +444,34 @@ static void inline idctSparse2Col (int16_t * col)
col[8*4] = (a3 - b3) >> COL_SHIFT;
}
#ifdef ARCH_ALPHA
/* If all rows but the first one are zero after row transformation,
all rows will be identical after column transformation. */
static inline void idctCol2(int16_t *col)
{
int i;
uint64_t l, r;
uint64_t *lcol = (uint64_t *) col;
for (i = 0; i < 8; ++i) {
int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4;
a0 *= W4;
col[0] = a0 >> COL_SHIFT;
++col;
}
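/* The transformed first row now sits in lcol[0] and lcol[1]; broadcast
it to the remaining seven rows. */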
l = lcol[0];
r = lcol[1];
lcol[ 2] = l; lcol[ 3] = r;
lcol[ 4] = l; lcol[ 5] = r;
lcol[ 6] = l; lcol[ 7] = r;
lcol[ 8] = l; lcol[ 9] = r;
lcol[10] = l; lcol[11] = r;
lcol[12] = l; lcol[13] = r;
lcol[14] = l; lcol[15] = r;
}
#endif
void simple_idct (short *block)
{
@@ -411,7 +546,22 @@ void simple_idct (short *block)
for(i=0; i<8; i++)
idctSparse2Col(block + i);
}
#else
#elif defined(ARCH_ALPHA)
int shortcut = 1;
for (i = 0; i < 8; i++) {
int anynonzero = idctRowCondDC(block + 8 * i);
if (i > 0 && anynonzero)
shortcut = 0;
}
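/* shortcut survives only if rows 1..7 came out all-zero, in which case
every row is identical after the column transform and idctCol2 can
compute one row and replicate it. */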
if (shortcut) {
idctCol2(block);
} else {
for (i = 0; i < 8; i++)
idctSparseCol(block + i);
}
#else
for(i=0; i<8; i++)
idctRowCondDC(block + i*8);