Bug 498770 - Enable optimized Theora code in Windows builds - r=kinetik rs=roc

This commit is contained in:
David Schleef 2009-06-19 15:03:45 +12:00
parent 47d0907360
commit fd39c3ad32
6 changed files with 122 additions and 4 deletions

View File

@ -3,6 +3,7 @@ source distribution using the update.sh script. The changes made were
those applied by update.sh, the addition/update of Makefile.in files those applied by update.sh, the addition/update of Makefile.in files
for the Mozilla build system and the patch in bug below. for the Mozilla build system and the patch in bug below.
bug498770.patch - Enable optimized Theora code in Windows builds (Bug 498770)
Bug 455357 - WinCE LibTheora Pre-defined Macro usage in local variable Bug 455357 - WinCE LibTheora Pre-defined Macro usage in local variable
455357_wince_local_variable_macro_clash_patch 455357_wince_local_variable_macro_clash_patch
This patch is needed for building WinCE / WinMobile because the This patch is needed for building WinCE / WinMobile because the

View File

@ -0,0 +1,97 @@
Index: lib/dec/x86_vc/mmxfrag.c
===================================================================
--- lib/dec/x86_vc/mmxfrag.c (revision 16142)
+++ lib/dec/x86_vc/mmxfrag.c (working copy)
@@ -27,12 +27,14 @@
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
const ogg_int16_t *_residue){
+ int _save_ebx;
/* ---------------------------------------------------------------------
This function does the inter reconstruction step with 8 iterations
unrolled. The iteration for each instruction is noted by the #id in the
comments (in case you want to reconstruct it)
--------------------------------------------------------------------- */
_asm{
+ mov [_save_ebx], ebx
mov edi, [_residue] /* load residue ptr */
mov eax, 0x00800080 /* generate constant */
mov ebx, [_dst_ystride] /* load dst-stride */
@@ -93,6 +95,7 @@
packuswb mm3, mm4 /* #8 pack to byte */
movq [edx + ecx*2], mm1 /* #7 write row */
movq [edx + eax], mm3 /* #8 write row */
+ mov ebx, [_save_ebx]
}
}
@@ -100,6 +103,7 @@
void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
+ int _save_ebx;
/* ---------------------------------------------------------------------
This function does the inter reconstruction step with two iterations
running in parallel to hide some load-latencies and break the dependency
@@ -107,6 +111,7 @@
comments (in case you want to reconstruct it)
--------------------------------------------------------------------- */
_asm{
+ mov [_save_ebx], ebx
pxor mm0, mm0 /* generate constant 0 */
mov esi, [_src]
mov edi, [_residue]
@@ -143,6 +148,7 @@
movq [edx + ebx], mm7 /* #2 write row */
lea edx, [edx+ebx*2] /* dst += stride * 2 */
jne nextchunk
+ mov ebx, [_save_ebx]
}
}
@@ -150,6 +156,7 @@
void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2,
int _src2_ystride,const ogg_int16_t *_residue){
+ int _save_ebx;
/* ---------------------------------------------------------------------
This function does the inter2 reconstruction step.The building of the
average is done with a bit-twiddeling trick to avoid excessive register
@@ -166,6 +173,7 @@
using the pavgb instruction let me know and I'll do the 3dnow codepath.
--------------------------------------------------------------------- */
_asm{
+ mov [_save_ebx], ebx
mov eax, 0xfefefefe
mov esi, [_src1]
mov edi, [_src2]
@@ -204,6 +212,7 @@
packuswb mm2, mm3 /* pack and saturate */
movq [edx], mm2 /* write row */
jne nextrow
+ mov ebx, [_save_ebx]
}
}
Index: lib/dec/x86_vc/mmxloopfilter.c
===================================================================
--- lib/dec/x86_vc/mmxloopfilter.c (revision 16142)
+++ lib/dec/x86_vc/mmxloopfilter.c (working copy)
@@ -38,7 +38,7 @@
_asm {
mov eax, [_pix]
mov edx, [_ystride]
- mov ebx, [_ll]
+ mov ecx, [_ll]
/* _pix -= ystride */
sub eax, edx
@@ -104,7 +104,7 @@
/*Free up mm5.*/
packuswb mm4, mm5
/*mm0=L L L L*/
- movq mm0, [ebx]
+ movq mm0, [ecx]
/*if(R_i<-2L||R_i>2L)R_i=0:*/
movq mm5, mm2
pxor mm6, mm6

View File

@ -48,12 +48,10 @@ FORCE_STATIC_LIB= 1
DEFINES += -DTHEORA_DISABLE_ENCODE DEFINES += -DTHEORA_DISABLE_ENCODE
ifeq ($(findstring 86,$(OS_TEST)), 86) ifeq ($(findstring 86,$(OS_TEST)), 86)
ifneq ($(OS_ARCH),WINNT)
ifneq ($(OS_ARCH),SunOS) ifneq ($(OS_ARCH),SunOS)
DEFINES += -DOC_X86ASM -DUSE_ASM DEFINES += -DOC_X86ASM -DUSE_ASM
endif endif
endif endif
endif
VPATH := $(srcdir) $(srcdir)/dec VPATH := $(srcdir) $(srcdir)/dec
@ -75,6 +73,17 @@ CSRCS = \
$(NULL) $(NULL)
ifeq ($(findstring 86,$(OS_TEST)), 86) ifeq ($(findstring 86,$(OS_TEST)), 86)
ifeq ($(OS_ARCH),WINNT)
VPATH += $(srcdir)/dec/x86_vc
CSRCS += \
mmxfrag.c \
mmxloopfilter.c \
x86state.c \
mmxstate.c \
mmxidct.c \
$(NULL)
else
VPATH += $(srcdir)/dec/x86 VPATH += $(srcdir)/dec/x86
CSRCS += \ CSRCS += \
@ -84,6 +93,7 @@ CSRCS += \
mmxidct.c \ mmxidct.c \
$(NULL) $(NULL)
endif endif
endif
include $(topsrcdir)/config/rules.mk include $(topsrcdir)/config/rules.mk

View File

@ -27,12 +27,14 @@
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride, void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
const ogg_int16_t *_residue){ const ogg_int16_t *_residue){
int _save_ebx;
/* --------------------------------------------------------------------- /* ---------------------------------------------------------------------
This function does the inter reconstruction step with 8 iterations This function does the inter reconstruction step with 8 iterations
unrolled. The iteration for each instruction is noted by the #id in the unrolled. The iteration for each instruction is noted by the #id in the
comments (in case you want to reconstruct it) comments (in case you want to reconstruct it)
--------------------------------------------------------------------- */ --------------------------------------------------------------------- */
_asm{ _asm{
mov [_save_ebx], ebx
mov edi, [_residue] /* load residue ptr */ mov edi, [_residue] /* load residue ptr */
mov eax, 0x00800080 /* generate constant */ mov eax, 0x00800080 /* generate constant */
mov ebx, [_dst_ystride] /* load dst-stride */ mov ebx, [_dst_ystride] /* load dst-stride */
@ -93,6 +95,7 @@ void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
packuswb mm3, mm4 /* #8 pack to byte */ packuswb mm3, mm4 /* #8 pack to byte */
movq [edx + ecx*2], mm1 /* #7 write row */ movq [edx + ecx*2], mm1 /* #7 write row */
movq [edx + eax], mm3 /* #8 write row */ movq [edx + eax], mm3 /* #8 write row */
mov ebx, [_save_ebx]
} }
} }
@ -100,6 +103,7 @@ void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride, void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){ const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
int _save_ebx;
/* --------------------------------------------------------------------- /* ---------------------------------------------------------------------
This function does the inter reconstruction step with two iterations This function does the inter reconstruction step with two iterations
running in parallel to hide some load-latencies and break the dependency running in parallel to hide some load-latencies and break the dependency
@ -107,6 +111,7 @@ void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
comments (in case you want to reconstruct it) comments (in case you want to reconstruct it)
--------------------------------------------------------------------- */ --------------------------------------------------------------------- */
_asm{ _asm{
mov [_save_ebx], ebx
pxor mm0, mm0 /* generate constant 0 */ pxor mm0, mm0 /* generate constant 0 */
mov esi, [_src] mov esi, [_src]
mov edi, [_residue] mov edi, [_residue]
@ -143,6 +148,7 @@ nextchunk:
movq [edx + ebx], mm7 /* #2 write row */ movq [edx + ebx], mm7 /* #2 write row */
lea edx, [edx+ebx*2] /* dst += stride * 2 */ lea edx, [edx+ebx*2] /* dst += stride * 2 */
jne nextchunk jne nextchunk
mov ebx, [_save_ebx]
} }
} }
@ -150,6 +156,7 @@ nextchunk:
void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride, void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2, const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2,
int _src2_ystride,const ogg_int16_t *_residue){ int _src2_ystride,const ogg_int16_t *_residue){
int _save_ebx;
/* --------------------------------------------------------------------- /* ---------------------------------------------------------------------
This function does the inter2 reconstruction step.The building of the This function does the inter2 reconstruction step.The building of the
average is done with a bit-twiddeling trick to avoid excessive register average is done with a bit-twiddeling trick to avoid excessive register
@ -166,6 +173,7 @@ void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
using the pavgb instruction let me know and I'll do the 3dnow codepath. using the pavgb instruction let me know and I'll do the 3dnow codepath.
--------------------------------------------------------------------- */ --------------------------------------------------------------------- */
_asm{ _asm{
mov [_save_ebx], ebx
mov eax, 0xfefefefe mov eax, 0xfefefefe
mov esi, [_src1] mov esi, [_src1]
mov edi, [_src2] mov edi, [_src2]
@ -204,6 +212,7 @@ nextrow:
packuswb mm2, mm3 /* pack and saturate */ packuswb mm2, mm3 /* pack and saturate */
movq [edx], mm2 /* write row */ movq [edx], mm2 /* write row */
jne nextrow jne nextrow
mov ebx, [_save_ebx]
} }
} }

View File

@ -38,7 +38,7 @@ static void loop_filter_v(unsigned char *_pix,int _ystride,
_asm { _asm {
mov eax, [_pix] mov eax, [_pix]
mov edx, [_ystride] mov edx, [_ystride]
mov ebx, [_ll] mov ecx, [_ll]
/* _pix -= ystride */ /* _pix -= ystride */
sub eax, edx sub eax, edx
@ -104,7 +104,7 @@ static void loop_filter_v(unsigned char *_pix,int _ystride,
/*Free up mm5.*/ /*Free up mm5.*/
packuswb mm4, mm5 packuswb mm4, mm5
/*mm0=L L L L*/ /*mm0=L L L L*/
movq mm0, [ebx] movq mm0, [ecx]
/*if(R_i<-2L||R_i>2L)R_i=0:*/ /*if(R_i<-2L||R_i>2L)R_i=0:*/
movq mm5, mm2 movq mm5, mm2
pxor mm6, mm6 pxor mm6, mm6

View File

@ -53,3 +53,4 @@ cp $1/include/theora/theora.h ./include/theora/theora.h
cp $1/include/theora/theoradec.h ./include/theora/theoradec.h cp $1/include/theora/theoradec.h ./include/theora/theoradec.h
cp $1/include/theora/codec.h ./include/theora/codec.h cp $1/include/theora/codec.h ./include/theora/codec.h
patch -p3 <455357_wince_local_variable_macro_clash_patch patch -p3 <455357_wince_local_variable_macro_clash_patch
patch -p0 <bug498770.patch