Bug 608166 - Add ARM assembly optimizations for libtheora r=chris.double,tterribe,khuey a=b-f

2024-11-24 13:21:05 +00:00 · 2010-11-08 09:47:34 +02:00 · 2010-11-08 09:47:34 +02:00 · bed266bd26
commit bed266bd26
parent dc49da79ab
62 changed files with 8124 additions and 2975 deletions
--- a/media/libtheora/AUTHORS
+++ b/media/libtheora/AUTHORS
@ -16,6 +16,9 @@ Nils Pipenbrinck
 Monty
 	- MMX optimized functions
 	
+David Schleef
+	- C64x port
+	
 Aaron Colwell
 Thomas Vander Stichele
 Jan Gerber
@ -45,5 +48,7 @@ Arc Riley
 Rodolphe Ortalo
 	- Bug fixes

+Robin Watts
+	- ARM code optimisations

 and other Xiph.org contributors
--- a/media/libtheora/CHANGES
+++ b/media/libtheora/CHANGES
@ -1,6 +1,26 @@
+libtheora 1.2.0alpha1 (2010 September 23)
+
+- New 'ptalarbvorm' encoder with better rate/distortion optimization
+- New th_encode_ctl option for copying configuration from an existing
+  setup header, useful for splicing streams.
+- Returns TH_DUPFRAME in more cases.
+- Add ARM optimizations
+- Add TI C64x+ DSP optimizations
+- Other performance improvements
+- Rename speedlevel 2 to 3 and provide a new speedlevel 2
+- Various minor bug fixes
+
 libtheora 1.1.2 (unreleased snapshot)

- - no changes recorded
+ - Fix Huffman table decoding when OC_HUFF_SLUSH is set to 0
+ - Fix a frame size bug in player_example
+ - Add support for passing a buffer the size of the picture
+   region, rather than a full padded frame to th_encode_ycbcr_in()
+   as was possible with the legacy pre-1.0 API.
+ - 4:4:4 support in player_example using software yuv->rgb
+ - Better rgb->yuv conversion in png2theora
+ - Clean up warnings and local variables
+ - Build and documentation fixes

 libtheora 1.1.1 (2009 October 1)

@ -128,7 +148,7 @@ libtheora 1.0beta1 (2007 September 22)
 - Granulepos scheme modified to match other codecs. This bumps
   the bitstream revision to 3.2.1. Bitstreams marked 3.2.0 are
   handled correctly by this decoder. Older decoders will show
-   a one frame sync error in the less noticable direction.
+   a one frame sync error in the less noticeable direction.

 libtheora 1.0alpha8 (2007 September 18)

--- a/media/libtheora/README
+++ b/media/libtheora/README
@ -1,5 +1,5 @@
 -------------------------------------------------------------------------
-             The Xiph.org Foundation's libtheora 1.1 
+             The Xiph.org Foundation's libtheora 1.2
 -------------------------------------------------------------------------

 *** What is Theora?
@ -12,10 +12,13 @@ while allow it a longer useful lifetime as an competitive codec.
 The 1.0 release decoder supported all the new features, but the
 encoder is nearly identical to the VP3 code.

-The 1.1 release features a completely rewritten encoder, offering
+The 1.1 release featured a completely rewritten encoder, offering
 better performance and compression, and making more complete use
-of the format's feature set. Files produced by both encoders can
-be decoded by either release.
+of the format's feature set.
+
+The 1.2 release features significant additional improvements in
+compression and performance. Files produced by newer encoders can
+be decoded by earlier releases.

 *** Where is Theora?

@ -41,6 +44,7 @@ Requirements summary:
      as above,

      libvorbis and libvorbisenc 1.0.1 or newer.
+      (libvorbis 1.3.1 or newer for 5.1 audio)

  For creating a source distribution package:

@ -66,7 +70,7 @@ Windows build support is included in the win32 directory.

 Project files for Apple XCode are included in the macosx directory.

-There is also an experimental scons build.
+There is also a more limited scons build.

 *** How do I use the sample encoder?

--- a/media/libtheora/README_MOZILLA
+++ b/media/libtheora/README_MOZILLA
@ -2,6 +2,4 @@ The source from this directory was copied from the theora subversion trunk
 using the update.sh script. The changes made were those applied by update.sh,
 the addition/update of Makefile.in files for the Mozilla build system.

-The subversion revision used was r16712.
-
-bug559343.patch: Silence Coverity warning.
+The subversion revision used was r17578.
--- a/media/libtheora/bug559343.patch
+++ b/media/libtheora/bug559343.patch
@ -1,22 +0,0 @@
-diff --git a/media/libtheora/lib/state.c b/media/libtheora/lib/state.c
--- a/media/libtheora/lib/state.c
-+++ b/media/libtheora/lib/state.c
-@@ -87,17 +87,17 @@ static void oc_sb_create_plane_mapping(o
-       int       quadi;
-       int       i;
-       /*Figure out how many rows of blocks in this super block lie within the
-          image.*/
-       jmax=_hfrags-x;
-       if(jmax>4)jmax=4;
-       else if(jmax<=0)break;
-       /*By default, set all fragment indices to -1.*/
-      memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi]));
-+      memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi]));
-       /*Fill in the fragment map for this super block.*/
-       xfrag=yfrag+x;
-       for(i=0;i<imax;i++){
-         int j;
-         for(j=0;j<jmax;j++){
-           _sb_maps[sbi][SB_MAP[i][j][0]][SB_MAP[i][j][1]]=xfrag+j;
-         }
-         xfrag+=_hfrags;
--- a/media/libtheora/include/theora/config.h
+++ b/media/libtheora/include/theora/config.h
@ -1,90 +0,0 @@
-/* config.h.  Generated from config.h.in by configure.  */
-/* config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* libcairo is available for visual debugging output */
-/* #undef HAVE_CAIRO */
-
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#define HAVE_DLFCN_H 1
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#define HAVE_INTTYPES_H 1
-
-/* Define to 1 if you have the <machine/soundcard.h> header file. */
-/* #undef HAVE_MACHINE_SOUNDCARD_H */
-
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
-/* Define to 1 if you have the <soundcard.h> header file. */
-/* #undef HAVE_SOUNDCARD_H */
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#define HAVE_STDINT_H 1
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#define HAVE_STDLIB_H 1
-
-/* Define to 1 if you have the <strings.h> header file. */
-#define HAVE_STRINGS_H 1
-
-/* Define to 1 if you have the <string.h> header file. */
-#define HAVE_STRING_H 1
-
-/* Define to 1 if you have the <sys/soundcard.h> header file. */
-/* #undef HAVE_SYS_SOUNDCARD_H */
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#define HAVE_SYS_STAT_H 1
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#define HAVE_SYS_TYPES_H 1
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#define HAVE_UNISTD_H 1
-
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
-#define LT_OBJDIR ".libs/"
-
-/* Define to 1 if your C compiler doesn't accept -c and -o together. */
-/* #undef NO_MINUS_C_MINUS_O */
-
-/* make use of x86_64 asm optimization */
-/* #undef OC_X86_64_ASM */
-
-/* make use of x86 asm optimization */
- /**/
-
-/* Name of package */
-#define PACKAGE "libtheora"
-
-/* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT ""
-
-/* Define to the full name of this package. */
-#define PACKAGE_NAME "libtheora"
-
-/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libtheora 1.1.1+svn"
-
-/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "libtheora"
-
-/* Define to the home page for this package. */
-#define PACKAGE_URL ""
-
-/* Define to the version of this package. */
-#define PACKAGE_VERSION "1.1.1+svn"
-
-/* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
-
-/* Define to exclude encode support from the build */
-/* #undef THEORA_DISABLE_ENCODE */
-
-/* Define to exclude floating point code from the build */
-/* #undef THEORA_DISABLE_FLOAT */
-
-/* Version number of package */
-#define VERSION "1.1.1+svn"
--- a/media/libtheora/include/theora/theora.h
+++ b/media/libtheora/include/theora/theora.h
@ -179,7 +179,7 @@ typedef enum {
  OC_PF_420,    /**< Chroma subsampling by 2 in each direction (4:2:0) */
  OC_PF_RSVD,   /**< Reserved value */
  OC_PF_422,    /**< Horizonatal chroma subsampling by 2 (4:2:2) */
-  OC_PF_444,    /**< No chroma subsampling at all (4:4:4) */
+  OC_PF_444     /**< No chroma subsampling at all (4:4:4) */
 } theora_pixelformat;

 /**
--- a/media/libtheora/include/theora/theoradec.h
+++ b/media/libtheora/include/theora/theoradec.h
@ -283,7 +283,8 @@ extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
 * \retval 0             Success.
 *                       A new decoded frame can be retrieved by calling
 *                        th_decode_ycbcr_out().
- * \retval TH_DUPFRAME   The packet represented a dropped (0-byte) frame.
+ * \retval TH_DUPFRAME   The packet represented a dropped frame (either a
+ *                        0-byte frame or an INTER frame with no coded blocks).
 *                       The player can skip the call to th_decode_ycbcr_out(),
 *                        as the contents of the decoded frame buffer have not
 *                        changed.
--- a/media/libtheora/include/theora/theoraenc.h
+++ b/media/libtheora/include/theora/theoraenc.h
@ -43,7 +43,7 @@ extern "C" {
 * <tt>NULL</tt> may be specified to revert to the default tables.
 *
 * \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
- * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
 * \retval TH_EINVAL Encoding has already begun or one or more of the given
 *                     tables is not full or prefix-free, \a _buf is
 *                     <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
@ -57,7 +57,7 @@ extern "C" {
 * <tt>NULL</tt> may be specified to revert to the default parameters.
 *
 * \param[in] _buf #th_quant_info
- * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
 * \retval TH_EINVAL Encoding has already begun, \a _buf is 
 *                    <tt>NULL</tt> and \a _buf_sz is not zero,
 *                    or \a _buf is non-<tt>NULL</tt> and
@ -73,7 +73,7 @@ extern "C" {
 * \param[in]  _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
 *                   frames.
 * \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
 * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
@ -101,7 +101,7 @@ extern "C" {
 *                   4:2:0, the picture region is smaller than the full frame,
 *                   or if encoding has begun, preventing the quantization
 *                   tables and codebooks from being set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
 * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
@ -114,7 +114,7 @@ extern "C" {
 *  the current encoding mode (VBR vs. constant quality, etc.).
 *
 * \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
 * \retval TH_EIMPL   Not supported by this implementation in the current
 *                    encoding mode.*/
@ -124,7 +124,7 @@ extern "C" {
 *
 * \param[in] _buf <tt>int</tt>: The new encoding speed level.
 *                 0 is slowest, larger values use less CPU.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
 *                    encoding speed level is out of bounds.
 *                   The maximum encoding speed level may be
@ -142,7 +142,7 @@ extern "C" {
 *
 * \param[out] _buf <tt>int</tt>: The current encoding speed level.
 *                  0 is slowest, larger values use less CPU.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
 * \retval TH_EIMPL   Not supported by this implementation in the current
 *                    encoding mode.*/
@ -162,7 +162,7 @@ extern "C" {
 *
 * \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
 *                 If this is negative or zero, no duplicates will be produced.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
 *                    number of duplicates is greater than or equal to the
 *                    maximum keyframe interval.
@ -187,7 +187,7 @@ extern "C" {
 *                    use.
 *                 - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
 *                    later.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
 *                    is not enabled.
 * \retval TH_EIMPL   Not supported by this implementation in the current
@ -211,7 +211,7 @@ extern "C" {
 * \param[in]  _buf <tt>int</tt>: Requested size of the reservoir measured in
 *                   frames.
 * \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
 *                    is not enabled.  The buffer has an implementation
 *                    defined minimum and maximum size and the value in _buf
@ -243,7 +243,7 @@ extern "C" {
 *              application.
 * \retval >=0       The number of bytes of metric data available in the
 *                    returned buffer.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
 *                    bitrate has been set, or the first call was made after
 *                    the first frame was submitted for encoding.
@ -283,7 +283,7 @@ extern "C" {
 *                  of bytes consumed.
 * \retval >0            The number of bytes of metric data required/consumed.
 * \retval 0             No more data is required before the next frame.
- * \retval TH_EFAULT     \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc is <tt>NULL</tt>.
 * \retval TH_EINVAL     No target bitrate has been set, or the first call was
 *                        made after the first frame was submitted for
 *                        encoding.
@ -306,7 +306,7 @@ extern "C" {
 * \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
 *                  inclusive.
 * \retval 0             Success.
- * \retval TH_EFAULT     \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL     A target bitrate has already been specified, or the
 *                        quality index was not in the range 0...63.
 * \retval TH_EIMPL       Not supported by this implementation.*/
@ -328,10 +328,50 @@ extern "C" {
 *
 * \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
 * \retval 0             Success.
- * \retval TH_EFAULT     \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
 * \retval TH_EINVAL     The target bitrate was not positive.
 * \retval TH_EIMPL       Not supported by this implementation.*/
 #define TH_ENCCTL_SET_BITRATE (30)
+/**Sets the configuration to be compatible with that from the given setup
+ *  header.
+ * This sets the Huffman codebooks and quantization parameters to match those
+ *  found in the given setup header.
+ * This guarantees that packets encoded by this encoder will be decodable using
+ *  a decoder configured with the passed-in setup header.
+ * It does <em>not</em> guarantee that th_encode_flushheader() will produce a
+ *  bit-identical setup header, only that they will be compatible.
+ * If you need a bit-identical setup header, then use the one you passed into
+ *  this command, and not the one returned by th_encode_flushheader().
+ *
+ * This also does <em>not</em> enable or disable VP3 compatibility; that is not
+ *  signaled in the setup header (or anywhere else in the encoded stream), and
+ *  is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
+ * If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
+ *  and quantization parameters to match the given setup header, you should
+ *  enable VP3 compatibility before invoking this command, otherwise the
+ *  codebooks and quantization parameters will be reset to the VP3 defaults.
+ *
+ * The current encoder does not support Huffman codebooks which do not contain
+ *  codewords for all 32 tokens.
+ * Such codebooks are legal, according to the specification, but cannot be
+ *  configured with this function.
+ *
+ * \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
+ *                                            the configuration from.
+ *                                           This should be the original,
+ *                                            undecoded setup header packet,
+ *                                            and <em>not</em> a #th_setup_info
+ *                                            structure filled in by
+ *                                            th_decode_headerin().
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     Encoding has already begun, so the codebooks and
+ *                        quantization parameters cannot be changed, or the
+ *                        data in the setup header was not supported by this
+ *                        encoder.
+ * \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
+ * \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_COMPAT_CONFIG (32)

 /*@}*/

@ -441,11 +481,25 @@ extern int th_encode_flushheader(th_enc_ctx *_enc,
 /**Submits an uncompressed frame to the encoder.
 * \param _enc   A #th_enc_ctx handle.
 * \param _ycbcr A buffer of Y'CbCr data to encode.
+ *               If the width and height of the buffer matches the frame size
+ *                the encoder was initialized with, the encoder will only
+ *                reference the portion inside the picture region.
+ *               Any data outside this region will be ignored, and need not map
+ *                to a valid address.
+ *               Alternatively, you can pass a buffer equal to the size of the
+ *                picture region, if this is less than the full frame size.
+ *               When using subsampled chroma planes, odd picture sizes or odd
+ *                picture offsets may require an unexpected chroma plane size,
+ *                and their use is generally discouraged, as they will not be
+ *                well-supported by players and other media frameworks.
+ *               See Section 4.4 of 
+ *                <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *                specification</a> for details if you wish to use them anyway.
 * \retval 0         Success.
 * \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
- * \retval TH_EINVAL The buffer size does not match the frame size the encoder
- *                    was initialized with, or encoding has already
- *                    completed.*/
+ * \retval TH_EINVAL The buffer size matches neither the frame size nor the
+ *                    picture size the encoder was initialized with, or
+ *                    encoding has already completed.*/
 extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
 /**Retrieves encoded video data packets.
 * This should be called repeatedly after each frame is submitted to flush any
--- a/media/libtheora/lib/Makefile.in
+++ b/media/libtheora/lib/Makefile.in
@ -34,15 +34,15 @@
 #
 # ***** END LICENSE BLOCK *****

-DEPTH		= ../../..
-topsrcdir	= @top_srcdir@
-srcdir		= @srcdir@
+DEPTH     = ../../..
+topsrcdir = @top_srcdir@
+srcdir    = @srcdir@

 include $(DEPTH)/config/autoconf.mk

-MODULE		= theora
-LIBRARY_NAME	= theora
-FORCE_STATIC_LIB= 1
+MODULE           = theora
+LIBRARY_NAME     = theora
+FORCE_STATIC_LIB = 1

 # The encoder is currently not included.
 DEFINES += -DTHEORA_DISABLE_ENCODE
@ -50,51 +50,103 @@ DEFINES += -DTHEORA_DISABLE_ENCODE
 ifeq ($(findstring 86,$(OS_TEST)), 86)
 ifneq ($(OS_ARCH),SunOS)
 ifneq ($(OS_ARCH)$(OS_TEST),WINNTx86_64)
-DEFINES += -DOC_X86_ASM -DUSE_ASM
+DEFINES += -DOC_X86_ASM
+ifeq (64,$(findstring 64,$(OS_TEST)))
+DEFINES += -DOC_X86_64_ASM
+endif
 endif
 endif
 endif

-VPATH		:= $(srcdir)
+VPATH := $(srcdir)

-CSRCS		= \
-		apiwrapper.c \
-		bitpack.c \
-		decapiwrapper.c \
-		decinfo.c \
-		decode.c \
-		dequant.c \
-		encoder_disabled.c \
-		fragment.c \
-		huffdec.c \
-		idct.c \
-		info.c \
-		internal.c \
-		quant.c \
-		state.c \
-		$(NULL)
+CSRCS = \
+  apiwrapper.c \
+  bitpack.c \
+  decapiwrapper.c \
+  decinfo.c \
+  decode.c \
+  dequant.c \
+  fragment.c \
+  huffdec.c \
+  idct.c \
+  info.c \
+  internal.c \
+  quant.c \
+  state.c \
+  $(NULL)

 ifeq ($(findstring 86,$(OS_TEST)), 86)
 ifdef _MSC_VER
 ifneq (64,$(findstring 64,$(OS_TEST)))
-VPATH		+= $(srcdir)/x86_vc
+VPATH += $(srcdir)/x86_vc

-CSRCS		+= \
-		mmxidct.c \
-		mmxfrag.c \
-		mmxstate.c \
-		x86state.c \
-		$(NULL)
+CSRCS += \
+  mmxidct.c \
+  mmxfrag.c \
+  mmxstate.c \
+  x86state.c \
+  x86cpu.c \
+  $(NULL)
 endif
 else
-VPATH		+= $(srcdir)/x86
+VPATH += $(srcdir)/x86
+
+CSRCS += \
+  mmxidct.c \
+  mmxfrag.c \
+  mmxstate.c \
+  sse2idct.c \
+  x86state.c \
+  x86cpu.c \
+  $(NULL)
+endif
+endif
+
+ifdef GNU_AS
+ifeq ($(findstring arm,$(OS_TEST)), arm)
+
+VPATH += $(srcdir)/arm
+
+CSRCS += \
+  armcpu.c \
+  armstate.c \
+  $(NULL)
+
+DEFINES += -DOC_ARM_ASM -DOC_ARM_ASM_EDSP -DOC_ARM_ASM_MEDIA -DOC_ARM_ASM_NEON
+
+# The Android NDK doesn't pre-define anything to indicate the OS it's on, so
+# do it for them.
+ifeq ($(OS_TARGET),Android)
+DEFINES += -D__linux__
+endif
+
+THEORA_ASFILES  = \
+  armbits.s \
+  armfrag.s \
+  armidct.s \
+  armloop.s \
+  armopts.s \
+  $(NULL)
+
+ASFILES = $(patsubst %.s,%-gnu.$(ASM_SUFFIX),$(THEORA_ASFILES))
+
+# These flags are a lie; they're just used to enable the requisite
+# opcodes; actual arch detection is done at runtime.
+ASFLAGS = -march=armv7-a -mfpu=neon
+
+armfrag-gnu.$(ASM_SUFFIX): armopts-gnu.S
+armidct-gnu.$(ASM_SUFFIX): armopts-gnu.S
+armloop-gnu.$(ASM_SUFFIX): armopts-gnu.S
+
+# armopts needs a specific rule, because arm2gnu.pl will always add the .S
+# suffix when translating the files that include it.
+armopts-gnu.S: armopts.s
+	$(PERL) $(srcdir)/arm/arm2gnu.pl < $< > $@
+# For all others, we can use an implicit rule with the configured $(ASM_SUFFIX).
+%-gnu.$(ASM_SUFFIX): %.s
+	$(PERL) $(srcdir)/arm/arm2gnu.pl < $< > $@

-CSRCS		+= \
-		mmxidct.c \
-		mmxfrag.c \
-		mmxstate.c \
-		x86state.c \
-		$(NULL)
 endif
 endif

--- a/media/libtheora/lib/apiwrapper.h
+++ b/media/libtheora/lib/apiwrapper.h
@ -21,7 +21,7 @@
 # include <theora/theora.h>
 # include "theora/theoradec.h"
 # include "theora/theoraenc.h"
-# include "internal.h"
+# include "state.h"

 typedef struct th_api_wrapper th_api_wrapper;
 typedef struct th_api_info    th_api_info;
--- a/media/libtheora/lib/arm/arm2gnu.pl
+++ b/media/libtheora/lib/arm/arm2gnu.pl
@ -0,0 +1,271 @@
+#!/usr/bin/perl
+
+my $bigend;  # little/big endian
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+    if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {    # consume leading command-line switches
+    $_ = shift;
+  last if /^--/;    # an argument starting with "--" ends switch parsing
+    if (/^-n/) {
+    $nflag++;    # -n: $printit stays unset, so lines are not echoed
+    next;
+    }
+    die "I don't recognize this switch: $_\n";    # real newline, not a literal "\n"
+}
+$printit++ unless $nflag;
+
+$\ = "\n";      # automatically add newline on print
+$n=0;
+
+$thumb = 0;     # ARM mode by default, not Thumb.
+
+LINE:
+while (<>) {
+
+    # For ADRLs we need to add a new line after the substituted one.
+    $addPadding = 0;
+
+    # First, we do not dare to touch *anything* inside double quotes, do we?
+    # Second, if you want a dollar character in the string,
+    # insert two of them -- that's how ARM C and assembler treat strings.
+    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1:   .ascii \"/   && do { s/\$\$/\$/g; next };
+    s/\bDCB\b[ \t]*\"/.ascii \"/                          && do { s/\$\$/\$/g; next };
+    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/                    && do { s/\$\$/\$/g; next };
+    # If there's nothing on a line but a comment, don't try to apply any further
+    #  substitutions (this is a cheap hack to avoid mucking up the license header)
+    s/^([ \t]*);/$1@/                                     && do { s/\$\$/\$/g; next };
+    # If substituted -- leave immediately !
+
+    s/@/,:/;
+    s/;/@/;
+    while ( /@.*'/ ) {
+      s/(@.*)'/$1/g;
+    }
+    s/\{FALSE\}/0/g;
+    s/\{TRUE\}/1/g;
+    s/\{(\w\w\w\w+)\}/$1/g;
+    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+    s/\bIMPORT\b/.extern/;
+    s/\bEXPORT\b/.global/;
+    s/^(\s+)\[/$1IF/;
+    s/^(\s+)\|/$1ELSE/;
+    s/^(\s+)\]/$1ENDIF/;
+    s/IF *:DEF:/ .ifdef/;
+    s/IF *:LNOT: *:DEF:/ .ifndef/;
+    s/ELSE/ .else/;
+    s/ENDIF/ .endif/;
+
+    if( /\bIF\b/ ) {
+      s/\bIF\b/ .if/;
+      s/=/==/;
+    }
+    if ( $n == 2) {
+        s/\$/\\/g;
+    }
+    if ($n == 1) {
+        s/\$//g;
+        s/label//g;
+    $n = 2;
+      }
+    if ( /MACRO/ ) {
+      s/MACRO *\n/.macro/;
+      $n=1;
+    }
+    if ( /\bMEND\b/ ) {
+      s/\bMEND\b/.endm/;
+      $n=0;
+    }
+
+    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+    #
+    if ( /\bAREA\b/ ) {
+        s/^(.+)CODE(.+)READONLY(.*)/    .text/;
+        s/^(.+)DATA(.+)READONLY(.*)/    .section .rdata\n    .align 2/;
+        s/^(.+)\|\|\.data\|\|(.+)/    .data\n    .align 2/;
+        s/^(.+)\|\|\.bss\|\|(.+)/    .bss/;
+    }
+
+    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||
+    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||
+    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||
+    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+    s/^(\s+)\%(\s)/    .space $1/;
+
+    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123
+    s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+    s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+    if (/\bPROC\b/)
+    {
+        print "    .thumb_func" if ($thumb);
+        s/\bPROC\b/@ $&/;
+    }
+    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+    s/\bENDP\b/@ $&/;
+    s/\bSUBT\b/@ $&/;
+    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
+    s/\bKEEP\b/@ $&/;
+    s/\bEXPORTAS\b/@ $&/;
+    s/\|\|(.)+\bEQU\b/@ $&/;
+    s/\|\|([\w\$]+)\|\|/$1/;
+    s/\bENTRY\b/@ $&/;
+    s/\bASSERT\b/@ $&/;
+    s/\bGBLL\b/@ $&/;
+    s/\bGBLA\b/@ $&/;
+    s/^\W+OPT\b/@ $&/;
+    s/:OR:/|/g;
+    s/:SHL:/<</g;
+    s/:SHR:/>>/g;
+    s/:AND:/&/g;
+    s/:LAND:/&&/g;
+    s/CPSR/cpsr/;
+    s/SPSR/spsr/;
+    s/ALIGN$/.balign 4/;
+    s/ALIGN\s+([0-9x]+)$/.balign $1/;
+    s/psr_cxsf/psr_all/;
+    s/LTORG/.ltorg/;
+    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+    #  {PC} + 0xdeadfeed  -->  . + 0xdeadfeed
+    s/\{PC\} \+/ \. +/;
+
+    # Single hex constant on the line !
+    #
+    # >>> NOTE <<<
+    #   Double-precision floats in gcc are always mixed-endian, which means
+    #   bytes in two words are little-endian, but words are big-endian.
+    #   So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+    #   and 0xfeed0000 at high address.
+    #
+    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+    # Single hex constant on the line !
+#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+    s/\bDCFS[ \t]+0x/.word 0x/;
+    s/\bDCFS\b/.float/;
+
+    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+    s/\bDCD\b/.word/;
+    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+    s/\bDCW\b/.short/;
+    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+    s/\bDCB\b/.byte/;
+    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+    s/^[A-Za-z_\.]\w+/$&:/;
+    s/^(\d+)/$1:/;
+    s/\%(\d+)/$1b_or_f/;
+    s/\%[Bb](\d+)/$1b/;
+    s/\%[Ff](\d+)/$1f/;
+    s/\%[Ff][Tt](\d+)/$1f/;
+    s/&([\dA-Fa-f]+)/0x$1/;
+    if ( /\b2_[01]+\b/ ) {
+      s/\b2_([01]+)\b/conv$1&&&&/g;
+      while ( /[01][01][01][01]&&&&/ ) {
+        s/0000&&&&/&&&&0/g;
+        s/0001&&&&/&&&&1/g;
+        s/0010&&&&/&&&&2/g;
+        s/0011&&&&/&&&&3/g;
+        s/0100&&&&/&&&&4/g;
+        s/0101&&&&/&&&&5/g;
+        s/0110&&&&/&&&&6/g;
+        s/0111&&&&/&&&&7/g;
+        s/1000&&&&/&&&&8/g;
+        s/1001&&&&/&&&&9/g;
+        s/1010&&&&/&&&&A/g;
+        s/1011&&&&/&&&&B/g;
+        s/1100&&&&/&&&&C/g;
+        s/1101&&&&/&&&&D/g;
+        s/1110&&&&/&&&&E/g;
+        s/1111&&&&/&&&&F/g;
+      }
+      s/000&&&&/&&&&0/g;
+      s/001&&&&/&&&&1/g;
+      s/010&&&&/&&&&2/g;
+      s/011&&&&/&&&&3/g;
+      s/100&&&&/&&&&4/g;
+      s/101&&&&/&&&&5/g;
+      s/110&&&&/&&&&6/g;
+      s/111&&&&/&&&&7/g;
+      s/00&&&&/&&&&0/g;
+      s/01&&&&/&&&&1/g;
+      s/10&&&&/&&&&2/g;
+      s/11&&&&/&&&&3/g;
+      s/0&&&&/&&&&0/g;
+      s/1&&&&/&&&&1/g;
+      s/conv&&&&/0x/g;
+    }
+
+    if ( /commandline/)
+    {
+        if( /-bigend/)
+        {
+            $bigend=1;
+        }
+    }
+
+    if ( /\bDCDU\b/ )    # DCDU <hex word>: emit it as four explicit .byte lines
+    {
+        my $cmd=$_;    # keep the untouched line; $_ is commented out below
+        my $value;
+        my $w1;
+        my $w2;
+        my $w3;
+        my $w4;
+
+        s/\s+DCDU\b/@ $&/;
+
+        $cmd =~ /\bDCDU\b\s+0x([0-9A-Fa-f]+)/;    # operand is hex: accept a-f, not only 0-9
+        $value = $1;
+        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;    # split into bytes, most significant first
+        $w1 = $1;
+        $w2 = $2;
+        $w3 = $3;
+        $w4 = $4;
+
+        if( $bigend ne "")
+        {
+            # big endian
+
+            print "        .byte      0x".$w1;
+            print "        .byte      0x".$w2;
+            print "        .byte      0x".$w3;
+            print "        .byte      0x".$w4;
+        }
+        else
+        {
+            # little endian
+
+            print "        .byte      0x".$w4;
+            print "        .byte      0x".$w3;
+            print "        .byte      0x".$w2;
+            print "        .byte      0x".$w1;
+        }
+
+    }
+
+
+    if ( /\badrl\b/i )
+    {
+        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+        $addPadding = 1;
+    }
+    s/\bEND\b/@ END/;
+} continue {
+    printf ("%s", $_) if $printit;
+    if ($addPadding != 0)
+    {
+        printf ("   mov r0,r0\n");
+        $addPadding = 0;
+    }
+}
+
--- a/media/libtheora/lib/arm/armbits.h
+++ b/media/libtheora/lib/arm/armbits.h
@ -0,0 +1,32 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+/*When the ARM asm is enabled, route the generic bit-packer entry points to the
+   hand-written versions exported by armbits.s.*/
+# if defined(OC_ARM_ASM)
+#  define oc_pack_read oc_pack_read_arm
+#  define oc_pack_read1 oc_pack_read1_arm
+#  define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+/*Reads _bits bits from _b (implemented in armbits.s).*/
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+/*Reads a single bit from _b (implemented in armbits.s).*/
+int oc_pack_read1_arm(oc_pack_buf *_b);
+/*Decodes one Huffman token from _b using the lookup table _tree (implemented
+   in armbits.s).*/
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
--- a/media/libtheora/lib/arm/armbits.s
+++ b/media/libtheora/lib/arm/armbits.s
@ -0,0 +1,230 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	EXPORT oc_pack_read_arm
+	EXPORT oc_pack_read1_arm
+	EXPORT oc_huff_token_decode_arm
+
+oc_pack_read1_arm PROC
+	; Reads one bit from the bit-packer and returns it in r0.
+	; Only the fast path lives here: when no bit is available it branches to
+	;  oc_pack_read1_refill (inside oc_pack_read_arm), which sets _bits=1 and
+	;  shares the general refill code, including the return to our caller.
+	; r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,#1          ; r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      ; r0 = window>>31
+	MOV r2,r2,LSL #1       ; r2 = window<<=1
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+	ENDP
+
+oc_pack_read_arm PROC
+	; Reads _bits bits from the bit-packer and returns them in r0.
+	; The fast path serves the request straight from the window.  Otherwise
+	;  oc_pack_read_refill tops the window up a byte at a time (at most four
+	;  bytes), and oc_pack_read_refill_last handles the cases where that was
+	;  still not enough.
+	; oc_pack_read1_refill is the entry used by oc_pack_read1_arm.
+	; r0 = oc_pack_buf *_b
+	; r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,r1          ; r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+
+; We need to refill window.
+; On entry r3 = available-_bits (negative), r2 = window, r12 = _b+8.
+oc_pack_read1_refill
+	MOV r1,#1
+oc_pack_read_refill
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     ; r10 = stop
+	                       ; r11 = ptr
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	RSB r3,r3,r0           ; r3 = 32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMP r10,r11            ; ptr<stop => HI
+	CMPHI r3,#7            ;   available<=24 => HI
+	LDRHIB r14,[r11],#1    ;     r14 = *ptr++
+	SUBHI r3,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;     r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;     ptr<stop => HI
+	CMPHI r3,#7            ;       available<=24 => HI
+	LDRHIB r14,[r11],#1    ;         r14 = *ptr++
+	SUBHI r3,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;         r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;         ptr<stop => HI
+	CMPHI r3,#7            ;           available<=24 => HI
+	LDRHIB r14,[r11],#1    ;             r14 = *ptr++
+	SUBHI r3,#8            ;             available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;             r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;             ptr<stop => HI
+	CMPHI r3,#7            ;               available<=24 => HI
+	LDRHIB r14,[r11],#1    ;                 r14 = *ptr++
+	SUBHI r3,#8            ;                 available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          ; r3 = available-=_bits, available<_bits => LT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+; Either we wanted to read more than 24 bits and didn't have enough room to
+;  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last
+	CMP r11,r10            ; ptr<stop => LO
+; If we didn't hit the end of the packet, then pull enough of the next byte
+;  to fill up the window.
+	LDRLOB r14,[r11]       ; (LO) r14 = *ptr
+; Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           ; (HS) r14 = 1
+	ADDLO r10,r3,r1        ; (LO) r10 = available
+	STRHS r14,[r12,#8]     ; (HS) eof = 1
+	ANDLO r10,r10,#7       ; (LO) r10 = available&7
+	MOVHS r3,#1<<30        ; (HS) available = OC_LOTS_OF_BITS
+; Only the top bits of the next byte fit into the empty low bits of the
+;  window, so the byte must be shifted right; a left shift would overwrite
+;  bits that are already valid.
+	ORRLO r2,r14,LSR r10   ; (LO) r2 = window|=*ptr>>(available&7)
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+	ENDP
+
+
+
+oc_huff_token_decode_arm PROC
+	; Decodes one Huffman token from the bit-packer and returns it in r0.
+	; Each table node starts with a bit count n; the next n bits of the window
+	;  index the entries that follow it.
+	; A positive entry is the offset of another node to descend into.
+	; Otherwise its negation packs the number of bits to consume (>>8) and the
+	;  token value (&255).
+	; The refill path tops the window up from the packet whenever a node needs
+	;  more bits than are currently available.
+	; r0 = oc_pack_buf       *_b
+	; r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         ; r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       ; r2 = stop
+	; Stall...             ; r3 = ptr
+	; Stall...             ; r4 = window
+	                       ; r5 = available
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  ; r14 = _tree+bits
+	LDRSH r12,[r14,#2]     ; r12 = node=_tree[1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+; The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue
+	ADD r12,r1,r12,LSL #1  ; r12 = _tree+node
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r5,r10          ; r5 = available-=n
+	LDRSH r10,[r12],#2     ; r10 = n=_tree[node]
+	; Stall...             ; r12 = _tree+node+1
+	; Stall...
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ; r12 += bits; the load below adds bits again,
+	                       ;  giving the 2*bits byte offset of a 16-bit entry
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+; Refill entry for the root node: point r12 at the root's first entry so the
+;  shared code below can index it like any other node.
+oc_huff_token_decode_refill0
+	ADD r12,r1,#2          ; r12 = _tree+1
+oc_huff_token_decode_refill
+; We can't possibly need more than 15 bits, so available must be <= 15.
+; Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              ; ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;   r14 = *ptr++
+	RSBHI r5,r5,#24        ; (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        ; (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    ;   r4 = window|=r14<<32-available
+	CMPHI r2,r3            ;   ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;     r14 = *ptr++
+	SUBHI r5,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;     r4 = window|=r14<<32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMPHI r2,r3            ;     ptr<stop => HI
+	CMPHI r5,#7            ;       available<=24 => HI
+	LDRHIB r14,[r3],#1     ;         r14 = *ptr++
+	SUBHI r5,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;         r4 = window|=r14<<32-available
+	CMP r2,r3              ; ptr<stop => HI
+	MOVLS r5,#-1<<30       ; (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            ; (HI) available<=24 => HI
+	LDRHIB r14,[r3],#1     ; (HI)   r14 = *ptr++
+	SUBHI r5,#8            ; (HI)   available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ; (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ; r12 += bits (the load adds bits once more)
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          ; r5 = available
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+	ENDP
+
+	END
--- a/media/libtheora/lib/arm/armcpu.c
+++ b/media/libtheora/lib/arm/armcpu.c
@ -0,0 +1,116 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+  last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+/*Without any optional instruction-set asm compiled in there is nothing to
+   detect, so report no capabilities.
+  OC_ARM_ASM_MEDIA is the macro the rest of the ARM code uses to enable the
+   ARMv6 paths; OC_ARM_ASM_ARMV6 is still honored for existing build setups.*/
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_MEDIA)&& \
+ !defined(OC_ARM_ASM_ARMV6)&&!defined(OC_ARM_ASM_NEON)
+/*Returns the set of OC_CPU_ARM_* flags supported by this CPU (always 0
+   here).*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+/*Returns the set of OC_CPU_ARM_* flags supported by this CPU.
+  Each capability is probed by executing one instruction from that extension
+   inside a structured exception handler: the flag is only set if the
+   instruction did not raise EXCEPTION_ILLEGAL_INSTRUCTION.
+  The probes are nested, so MEDIA is only tested when EDSP support is compiled
+   in, and NEON only when MEDIA support is compiled in as well.*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  flags=0;
+  /*MSVC has no inline __asm support for ARM, but it does let you __emit
+     instructions via their assembled hex code.
+    All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    /*Only reached if the instruction above executed without faulting.*/
+    flags|=OC_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OC_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OC_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+/*Returns non-zero if the feature token _name appears in the "Features" line
+   _line.
+  _name must include its leading space (e.g., " neon").
+  Every occurrence is checked, so a longer feature name that merely starts with
+   _name cannot hide a later exact match.*/
+static int oc_cpuinfo_has_feature(const char *_line,const char *_name){
+  const char *p;
+  size_t      len;
+  len=strlen(_name);
+  for(p=strstr(_line,_name);p!=NULL;p=strstr(p+1,_name)){
+    /*The token must end here, not continue into a longer name.*/
+    if(p[len]==' '||p[len]=='\n')return 1;
+  }
+  return 0;
+}
+
+/*Returns the set of OC_CPU_ARM_* flags supported by this CPU, as reported by
+   /proc/cpuinfo.
+  If the file cannot be opened, no capabilities are reported.*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t  flags;
+  FILE         *fin;
+  flags=0;
+  /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+     Android.
+    This also means that detection will fail in Scratchbox.*/
+  fin=fopen("/proc/cpuinfo","r");
+  if(fin!=NULL){
+    /*512 should be enough for anybody (it's even enough for all the flags that
+       x86 has accumulated... so far).*/
+    char buf[512];
+    while(fgets(buf,sizeof(buf),fin)!=NULL){
+      /*strncmp() stops at the terminating NUL, so short lines never cause
+         bytes left over from a previous read to be examined.*/
+      if(strncmp(buf,"Features",8)==0){
+        if(oc_cpuinfo_has_feature(buf," edsp"))flags|=OC_CPU_ARM_EDSP;
+        if(oc_cpuinfo_has_feature(buf," neon"))flags|=OC_CPU_ARM_NEON;
+      }
+      if(strncmp(buf,"CPU architecture:",17)==0){
+        long version;
+        /*strtol() yields 0 for non-numeric text and cannot overflow into
+           undefined behavior the way atoi() can.*/
+        version=strtol(buf+17,NULL,10);
+        if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+      }
+    }
+    fclose(fin);
+  }
+  return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+   accessible in privileged modes only, so we can't have a general user-space
+   detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform.  Reconfigure with --disable-asm (or send patches)."
+#endif
--- a/media/libtheora/lib/arm/armcpu.h
+++ b/media/libtheora/lib/arm/armcpu.h
@ -0,0 +1,29 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*Capability bits returned by oc_cpu_flags_get().*/
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA    (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP     (1<<7)
+#define OC_CPU_ARM_NEON     (1<<12)
+
+/*Returns the bitwise OR of the OC_CPU_ARM_* flags detected for the current
+   CPU, or 0 if nothing was detected or detection is not compiled in.*/
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
--- a/media/libtheora/lib/arm/armfrag.s
+++ b/media/libtheora/lib/arm/armfrag.s
@ -0,0 +1,656 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+; Vanilla ARM v4 versions
+	EXPORT	oc_frag_copy_list_arm
+	EXPORT	oc_frag_recon_intra_arm
+	EXPORT	oc_frag_recon_inter_arm
+	EXPORT	oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm PROC
+	; Copies a list of 8x8 byte blocks from _src_frame to _dst_frame.
+	; For each of the _nfragis indices in _fragis, the block starts at byte
+	;  offset _frag_buf_offs[index] in both frames, and its 8 rows are
+	;  _ystride bytes apart.
+	; Each row is moved as two 32-bit words.
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	; Only touch the list when it is non-empty, as the EDSP and NEON versions
+	;  do.
+	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4		; the first word of a row advances by 4
+ofcl_arm_lp
+	LDR	r11,[r14,r4,LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1		; flags stay live until the LDRGE/BGE below
+	; Stall (on XScale)
+	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		; r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end
+	LDMFD	r13!,{r4-r6,r11,PC}
+	ENDP
+
+oc_frag_recon_intra_arm PROC
+	; Reconstructs an intra-coded 8x8 block: each of the 64 residue values has
+	;  128 added, is clamped to 0...255 and stored as one byte.
+	; Rows of the destination are _ystride bytes apart.
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8			; r14 = rows remaining
+	MOV	r5, #255
+	SUB	r1, r1, #7		; 7 bytes of each row advance _dst by 1
+ofrintra_lp_arm
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	; Clamp idiom used throughout: after ADDS, a positive sum is compared
+	;  against 255.
+	; LT then means "negative" or "above 255", and the EOR with the sign mask
+	;  (ASR #32) produces a low byte of 0 or 255 respectively.
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1		; last byte of the row: step to the next row
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	ENDP
+
+oc_frag_recon_inter_arm PROC
+	; Reconstructs an inter-coded 8x8 block: each output byte is the matching
+	;  source byte plus its residue, clamped to 0...255.
+	; Source and destination rows are both ystride bytes apart.
+	; The next pixel's loads are interleaved between the CMPGT and EORLT of
+	;  the current one; loads do not alter the flags.
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src
+	; r2 =       int            ystride
+	; r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8			; r9 = rows remaining
+	MOV	r5, #255
+	SUB	r2, r2, #7		; 7 bytes of each row advance by 1
+ofrinter_lp_arm
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2		; last source byte: step to the next row
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2		; last output byte: step to the next row
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	ENDP
+
+oc_frag_recon_inter2_arm PROC
+	; Reconstructs a bi-predicted 8x8 block: each output byte is
+	;  ((src1+src2)>>1) plus its residue, clamped to 0...255.
+	; All three pixel buffers have rows ystride bytes apart.
+	; The residue pointer is the fifth argument and is read from the stack
+	;  before any registers are pushed.
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src1
+	; r2 = const unsigned char *src2
+	; r3 =       int            ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8			; r14 = rows remaining
+	MOV	r8, #255
+	SUB	r3, r3, #7		; 7 bytes of each row advance by 1
+ofrinter2_lp_arm
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3		; last src1 byte: step to the next row
+	LDRB	r6, [r2], r3		; last src2 byte: step to the next row
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3		; last output byte: step to the next row
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	ENDP
+
+ [ OC_ARM_ASM_EDSP
+	EXPORT	oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp PROC
+	; Copies a list of 8x8 byte blocks from _src_frame to _dst_frame, moving
+	;  each 8-byte row with a single LDRD/STRD pair.
+	; For each of the _nfragis indices in _fragis, the block starts at byte
+	;  offset _frag_buf_offs[index] in both frames, and its rows are _ystride
+	;  bytes apart.
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		; r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1		; flags stay live until the LDRGE/BGE below
+	; Stall (on XScale)
+	LDRD	r6, [r4, r5]!		; r4 = _src_frame+frag_buf_off
+	LDRD	r8, [r4, r2]!
+	; Stall
+	STRD	r6, [r5, r0]!		; r5 = _dst_frame+frag_buf_off
+	STRD	r8, [r5, r2]!
+	; Stall
+	LDRD	r6, [r4, r2]!	; On Xscale at least, doing 3 consecutive
+	LDRD	r8, [r4, r2]!	; loads causes a stall, but that's no worse
+	LDRD	r10,[r4, r2]!	; than us only doing 2, and having to do
+				; another pair of LDRD/STRD later on.
+	; Stall
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRD	r6, [r4, r2]!
+	LDRD	r8, [r4, r2]!
+	LDRD	r10,[r4, r2]!
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_frag_recon_intra_v6
+	EXPORT	oc_frag_recon_inter_v6
+	EXPORT	oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6 PROC
+	; Reconstructs an intra-coded 8x8 block using the ARMv6 packed 16-bit
+	;  instructions: each residue value has 128 added, is saturated to
+	;  0...255 and stored as one byte.
+	; One 8-byte row is produced per iteration; rows are _ystride bytes
+	;  apart.
+	; The digit diagrams in the comments show which pixel of the row occupies
+	;  each nibble of a register ("_" is a zero nibble).
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8			; r14 = rows remaining
+	MOV	r12,r2			; free r2/r3 for use as an LDRD/STRD pair
+	LDR	r6, =0x00800080		; 128 in both halfwords
+ofrintra_v6_lp
+	LDRD	r2, [r12],#8	; r2 = 11110000 r3 = 33332222
+	LDRD	r4, [r12],#8	; r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1		; flags stay live until the BGT below
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		; r2 = __11__00
+	USAT16	r3, #8, r3		; r3 = __33__22
+	USAT16	r4, #8, r4		; r4 = __55__44
+	USAT16	r5, #8, r5		; r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	; r2 = __111100
+	ORR	r3, r3, r3, LSR #8	; r3 = __333322
+	ORR	r4, r4, r4, LSR #8	; r4 = __555544
+	ORR	r5, r5, r5, LSR #8	; r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
+	STRD	r2, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	ENDP
+
+oc_frag_recon_inter_v6 PROC
+	; Reconstructs an inter-coded 8x8 block using the ARMv6 packed 16-bit
+	;  instructions: each output byte is the matching source byte plus its
+	;  residue, saturated to 0...255.
+	; One 8-byte row is produced per iteration; source and destination rows
+	;  are _ystride bytes apart.
+	; The digit diagrams in the comments show which pixel of the row occupies
+	;  each nibble of a register ("_" is zero, "x" is not yet saturated).
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8			; r14 = rows remaining
+ofrinter_v6_lp
+	LDRD	r6, [r3], #8		; r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1		; flags stay live until the BGT below
+ [ OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, [r1], r2	; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+ ]
+	; Regroup the residue into even and odd pixels to match the UXTB16
+	;  unpacking of the source bytes.
+	PKHBT	r12,r6, r7, LSL #16	; r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r6,r4			; r6 = __22__00
+	UXTB16	r4,r4, ROR #8		; r4 = __33__11
+	QADD16	r12,r12,r6		; r12= xx22xx00
+	QADD16	r4, r7, r4		; r4 = xx33xx11
+	LDRD	r6, [r3], #8		; r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		; r4 = __33__11
+	USAT16	r12,#8,r12		; r12= __22__00
+	ORR	r4, r12,r4, LSL #8	; r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	; r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 77775555
+	UXTB16	r6,r5			; r6 = __66__44
+	UXTB16	r5,r5, ROR #8		; r5 = __77__55
+	QADD16	r12,r12,r6		; r12= xx66xx44
+	QADD16	r5, r7, r5		; r5 = xx77xx55
+	USAT16	r12,#8, r12		; r12= __66__44
+	USAT16	r5, #8, r5		; r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	; r5 = 77665544
+	STRD	r4, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_inter2_v6 PROC
+	; Reconstructs a bi-predicted 8x8 block using the ARMv6 packed
+	;  instructions: each output byte is the halved sum of the two source
+	;  bytes (UHADD8) plus its residue, saturated to 0...255.
+	; One 8-byte row is produced per iteration, upper four pixels first; all
+	;  pixel rows are _ystride bytes apart.
+	; The residue pointer is the fifth argument and is read from the stack
+	;  before any registers are pushed.
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8			; r14 = rows remaining
+ofrinter2_v6_lp
+	LDRD	r6, [r12,#8]	; r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1		; flags stay live until the BGT below
+	LDR	r4, [r1, #4]	; Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	; Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	; r9 = 77775555
+	UHADD8	r4, r4, r5	; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			; r5 = __66__44
+	UXTB16	r4, r4, ROR #8		; r4 = __77__55
+	QADD16	r8, r8, r5		; r8 = xx66xx44
+	QADD16	r9, r9, r4		; r9 = xx77xx55
+	LDRD	r6,[r12],#16	; r6 = 11110000 r7 = 33332222
+	USAT16	r8, #8, r8		; r8 = __66__44
+	LDR	r4, [r1], r3	; Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		; r9 = __77__55
+	LDR	r5, [r2], r3	; Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	; r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 22220000
+	UHADD8	r4, r4, r5	; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r5, r4			; r5 = __22__00
+	UXTB16	r4, r4, ROR #8		; r4 = __33__11
+	QADD16	r8, r8, r5		; r8 = xx22xx00
+	QADD16	r7, r7, r4		; r7 = xx33xx11
+	USAT16	r8, #8, r8		; r8 = __22__00
+	USAT16	r7, #8, r7		; r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	; r8 = 33221100
+	STRD	r8, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_frag_copy_list_neon
+	EXPORT	oc_frag_recon_intra_neon
+	EXPORT	oc_frag_recon_inter_neon
+	EXPORT	oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon PROC
+	; Copies a list of 8x8 byte blocks from _src_frame to _dst_frame using
+	;  NEON: all eight rows are loaded into D0-D7 and then stored.
+	; For each of the _nfragis indices in _fragis, the block starts at byte
+	;  offset _frag_buf_offs[index] in both frames, and its rows are _ystride
+	;  bytes apart.
+	; While one block is copied, the offset of the next is fetched and PLD is
+	;  issued on its source rows.
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	; Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	; Stall (on XScale)
+	MOV	r7, r6			; Guarantee PLD points somewhere valid.
+ofcl_neon_lp
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4@64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4@64], r2
+	SUBS	r12, r12, #1		; GT below means another block follows
+	VLD1.64	{D2}, [r4@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4@64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4@64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4@64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4@64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4@64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5@64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5@64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5@64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5@64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_intra_neon PROC
+	; Reconstructs an intra-coded 8x8 block with NEON: all 64 residue values
+	;  are loaded at once, 128 is added with saturation, and the results are
+	;  narrowed to unsigned bytes with saturation (VQMOVUN) and stored one
+	;  8-byte row at a time, rows _ystride bytes apart.
+	; No core registers are saved; only r3 is used as scratch.
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	MOV	r3, #128
+	VDUP.S16	Q0, r3
+	VLDMIA	r2,  {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	; D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	; D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	; D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0@64], r1
+	VQMOVUN.S16	D19,Q11	; D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0@64], r1
+	VQMOVUN.S16	D20,Q12	; D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0@64], r1
+	VQMOVUN.S16	D21,Q13	; D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0@64], r1
+	VQMOVUN.S16	D22,Q14	; D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0@64], r1
+	VQMOVUN.S16	D23,Q15	; D23= !!@@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0@64], r1
+	VST1.64	{D22},[r0@64], r1
+	VST1.64	{D23},[r0@64], r1
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter_neon PROC
+	; Reconstructs an inter-coded 8x8 block with NEON: each source row is
+	;  widened to 16 bits, added to its residue row with saturation, then
+	;  narrowed to unsigned bytes with saturation and stored.
+	; Source and destination rows are _ystride bytes apart.
+	; The source rows are processed in two groups of four through Q0-Q3.
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	; etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	; Rows 0-3: add residue; each Q register is reloaded with a row of the
+	;  second group as soon as it has been consumed.
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r2
+	VST1.64	{D22},[r0@64], r2
+	VST1.64	{D23},[r0@64], r2
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter2_neon PROC
+	; Reconstructs a bi-predicted 8x8 block with NEON: the two source blocks
+	;  are averaged with a halving add (VHADD.U8), widened to 16 bits, added
+	;  to the residue with saturation, then narrowed to unsigned bytes with
+	;  saturation and stored.
+	; All pixel rows are _ystride bytes apart.
+	; Rows are averaged two at a time: src1 rows go into Q0/Q1 and src2 rows
+	;  into Q2/Q3.
+	; The residue pointer is the fifth argument and is read from the stack.
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	; Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		; etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r3
+	VST1.64	{D22},[r0@64], r3
+	VST1.64	{D23},[r0@64], r3
+	MOV	PC,R14
+	ENDP
+ ]
+
+	END
--- a/media/libtheora/lib/arm/armidct.s
+++ b/media/libtheora/lib/arm/armidct.s
--- a/media/libtheora/lib/arm/armint.h
+++ b/media/libtheora/lib/arm/armint.h
@ -0,0 +1,126 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+#  if defined(__ARMEB__)
+#   error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+#  endif
+
+/*Route the generic accelerator-initialization name to the ARM version.*/
+#  define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.
+  The vtable slot is cast to oc_loop_filter_frag_rows_arm_func and called with,
+   in order:
+    ref_frame_data[_refi], ref_ystride[_pli], _bv, frags,
+    froffset+_fragy0*nhfrags, froffset+_fragy_end*nhfrags,
+    froffset, froffset+nfrags, frag_buf_offs, nhfrags
+   where froffset, nhfrags and nfrags are read from fplanes[_pli].
+  _state and _pli are expanded several times, so the arguments should be free
+   of side effects.*/
+#  define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+  ((oc_loop_filter_frag_rows_arm_func) \
+   (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+   (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+   (_bv), \
+   (_state)->frags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset, \
+   (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+   (_state)->frag_buf_offs, \
+   (_state)->fplanes[(_pli)].nhfrags)
+/*For everything else the default vtable macros are fine.*/
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+/*The prototype the asm loop filter row functions are actually called with.
+  The oc_state_loop_filter_frag_rows() macro casts the vtable entry to this
+   type before calling it.
+  _fragi0 and _fragi0_end bound the fragments to filter, _fragi_top and
+   _fragi_bot bound the fragments of the plane, and _nhfrags is the number of
+   fragments in one row.*/
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+/*Installs the ARM function pointers into _state (see armstate.c).*/
+void oc_state_accel_init_arm(oc_theora_state *_state);
+/*Baseline versions (suffix _arm), declared whenever OC_ARM_ASM is defined.*/
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+/*The asm implementation indexes _bv with a signed byte load after adding 127
+   to the pointer.*/
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+/*The variant declarations are nested: the _v6 versions are only declared when
+   OC_ARM_ASM_EDSP is also defined, and the _neon versions only when
+   OC_ARM_ASM_MEDIA is also defined.*/
+#  if defined(OC_ARM_ASM_EDSP)
+/*EDSP version (suffix _edsp).*/
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+#   if defined(OC_ARM_ASM_MEDIA)
+/*ARMv6 media instruction versions (suffix _v6).*/
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+/*The asm implementation stores a single 32-bit word at _bv, which
+   oc_loop_filter_frag_rows_v6() reads back.*/
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#    if defined(OC_ARM_ASM_NEON)
+/*NEON versions (suffix _neon).*/
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+/*The asm implementation stores 16 bytes at _bv using a 128-bit alignment
+   qualifier, and oc_loop_filter_frag_rows_neon() reads them back the same
+   way.*/
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif
--- a/media/libtheora/lib/arm/armloop.s
+++ b/media/libtheora/lib/arm/armloop.s
@ -0,0 +1,676 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_loop_filter_frag_rows_arm
+
+; Mask for the "coded" bit in the first 32-bit word of an oc_fragment.
+; The row filters below load that word with LDR and TST/AND it with this mask.
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG	*	1
+
+	; Vanilla ARM v4 version
+loop_filter_h_arm PROC
+	; Filters a vertical edge: for each of 8 rows, reads the four pixels
+	;  _pix[-2.._pix+1], looks up the filter value in _bv and adjusts the two
+	;  pixels either side of the edge (_pix[-1] and _pix[0]).
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = signed char   *_bv (read with LDRSB at a signed index; the caller
+	;                          has already added 127 to the table pointer)
+	; preserves r0-r3; clobbers r12 and the flags
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8			; r14= row counter
+	MOV	r6, #255		; r6 = upper clamp bound
+lfh_arm_lp
+	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
+	LDRB	r12,[r0, #1]		; r12= _pix[3]
+	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]		; r12= f = _bv[r12] (signed byte)
+	; Stall (2 on Xscale)
+	; Clamp _pix[1]+f and _pix[2]-f to [0,255] in the low byte:
+	;  if the sum is negative, LT holds from ADDS/SUBS and the EOR gives
+	;   255^0xFFFFFFFF, whose low byte is 0;
+	;  if the sum is greater than 255, LT holds from the CMP and the EOR
+	;   gives 255^0 = 255;
+	;  otherwise the value is left alone.
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1		; store and step to the next row
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3	; restore r0 (undo 8 row steps)
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+loop_filter_v_arm PROC
+	; Filters a horizontal edge: for each of 8 columns, reads the four pixels
+	;  at _pix-2*_ystride, _pix-_ystride, _pix and _pix+_ystride, looks up the
+	;  filter value in _bv and adjusts the two pixels either side of the edge
+	;  (_pix-_ystride and _pix).
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = signed char   *_bv (read with LDRSB at a signed index; the caller
+	;                          has already added 127 to the table pointer)
+	; preserves r0-r3; clobbers r12 and the flags
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8			; r14= column counter
+	MOV	r6, #255		; r6 = upper clamp bound
+lfv_arm_lp
+	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
+	LDRB	r12,[r0, r1]		; r12= _pix[3]
+	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]		; r12= f = _bv[r12] (signed byte)
+	; Stall (2 on Xscale)
+	; Clamp _pix[1]+f and _pix[2]-f to [0,255] in the low byte:
+	;  negative sums take LT from ADDS/SUBS and become 255^0xFFFFFFFF (low
+	;  byte 0), sums above 255 take LT from the CMP and become 255^0 = 255.
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1		; store and step to the next column
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8		; restore r0 (undo 8 column steps)
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_arm PROC
+	; Walks the fragments with index in [_fragi0,_fragi0_end) one row of
+	;  _nhfrags fragments at a time and, for each coded fragment, filters:
+	;   its left edge if it is not the first fragment of its row,
+	;   its top edge if its row start is greater than _fragi_top,
+	;   its right edge if a next fragment exists in the row and is not coded,
+	;   its bottom edge if fragi+_nhfrags<_fragi_bot and the fragment one row
+	;    down is not coded.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; The remaining arguments arrive on the stack and are loaded into r4-r9:
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; _frags and _frag_buf_offs are both stepped 4 bytes per fragment.
+	; NOTE(review): when _nhfrags<=1 this returns without filtering anything;
+	;  confirm that matches the intended behavior for single-column planes.
+	MOV	r12,r13			; r12= address of the stack arguments
+	STMFD	r13!,{r0,r4-r11,r14}	; [r13] now holds _ref_frame_data
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	; _bv += 127
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; r7 = _fragi_bot-(_nhfrags-1)
+oslffri_arm_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		; if (fragi+1<_fragi_bot-(_nhfrags-1))
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && fragment below is not coded
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+	; Same loop bookkeeping as above for a fragment that was skipped.
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_loop_filter_init_v6
+	EXPORT	oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6 PROC
+	; Stores one 32-bit word at _bv whose four bytes are each the low 8 bits
+	;  of ~(2*_flimit), i.e. 255-2*L modulo 256.
+	; oc_loop_filter_frag_rows_v6 loads this word back as its "ll" value.
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		; r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	ENDP
+
+; We could use the same strategy as the v filter below, but that would require
+;  40 instructions to load the data and transpose it into columns and another
+;  32 to write out the results at the end, plus the 52 instructions to do the
+;  filtering itself.
+; This is slightly less, and less code, even assuming we could have shared the
+;  52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+;  proposed for FFmpeg, but not by much:
+;  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+;  of four.
+loop_filter_h_v6 PROC
+	; Filters a vertical edge over 8 rows by running the 4-row core twice,
+	;  the second time 4*_ystride further down.
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r3; clobbers r12
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003		; r12= constant expected by the core
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2	; restore r0
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+loop_filter_h_core_v6 PROC
+	; Filters a vertical edge over 4 consecutive rows.
+	; In the comments below the rows are named p, q, r and s (top to bottom)
+	;  and the digit is the byte position within the word loaded from _pix-2.
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; r12= 0x10003
+	; Preserves r0-r3, r12; Clobbers r4-r11.
+	; The rows are read with word loads starting at _pix-2.
+	; NOTE(review): presumably this relies on unaligned LDR support, since
+	;  _pix-2 is not word aligned whenever _pix is; confirm.
+	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
+	; Single issue
+	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(q3-q0)+3*(q1-q2)+3>
+	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
+	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r1-r2)+3>
+	SSUB16	r7, r5, r7		; r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(s3-s0)+3*(s1-s2)+3>
+	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(r3-r0)+3*(r1-r2)+3>>3>
+	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
+	; Single issue
+	; There's no min, max or abs instruction.
+	; SSUB8 and SEL will work for abs, and we can do all the rest with
+	;  unsigned saturated adds, which means the GE flags are still all
+	;  set when we're done computing lflim(abs(R_i),L).
+	; This allows us to both add and subtract, and split the results by
+	;  the original sign of R_i.
+	SSUB8	r7, r10,r6		; r7 = R_i; GE set per byte where R_i>=0
+	; Single issue
+	SEL	r7, r7, r6		; r7 = abs(R_i)
+	; Single issue
+	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
+	; Single issue
+	UQADD8	r7, r7, r4
+	; Single issue
+	UQSUB8	r7, r7, r4		; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	; Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
+	; Write the two filtered columns back, bottom row first, walking r0 back
+	;  up to its value on entry.
+	MOV	r5, r9, LSR #24		; r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		; r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		; r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		; r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		; r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		; r4 = q1
+	STRB	r4, [r0,#-1]
+	; Single issue
+	STRB	r9, [r0,-r1]!
+	; Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	ENDP
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+;  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+;  filter value, f:
+;   u = ~UHADD8(p1,~p2);
+;   v = UHADD8(~p1,p2);
+;   m = v-u;
+;   a = m^UHADD8(m^p0,m^~p3);
+;   f = UHADD8(UHADD8(a,u),v);
+;  where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+;  as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+;  but requires more code, because it does all eight columns at once, instead
+;  of four at a time.
+loop_filter_v_v6 PROC
+	; Filters a horizontal edge across 8 columns at once.
+	; Four rows of 8 bytes are loaded with LDRD from _pix-2*_ystride,
+	;  _pix-_ystride, _pix and _pix+_ystride; the two middle rows are
+	;  modified and written back with STRD.
+	; In the comments below p0-p3 are the four rows for columns 0-3 and p4-p7
+	;  the same four rows for columns 4-7.
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r11; clobbers r12
+	; NOTE(review): presumably _pix and _ystride keep every LDRD/STRD address
+	;  suitably aligned; confirm against the callers.
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>; r0 is back at _pix
+	MVN	r14,r6			; r14= ~p1
+	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
+	; Filter the first four columns.
+	MVN	r12,r8			; r12= ~p2
+	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
+	MVN	r10, r10		; r10=~p3
+	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		; r14= m1=v1-u1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = m1^p0
+	EOR	r10,r10,r14		; r10= m1^~p3
+	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		; r14= v1=m1+u1
+	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
+	MVN	r12,r9			; r12= ~p6
+	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
+	; Filter the second four columns.
+	MVN	r14,r7			; r14= ~p5
+	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
+	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			; r11=~p7
+	SSUB8	r10,r14,r12		; r10= m2=v2-u2
+	; Single issue
+	EOR	r5, r5, r10		; r5 = m2^p4
+	EOR	r11,r11,r10		; r11= m2^~p7
+	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
+	; Single issue
+	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	; Single issue
+	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
+	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
+	; Now split f[i] by sign.
+	; There's no min or max instruction.
+	; We could use SSUB8 and SEL, but this is just as many instructions and
+	;  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_v6 PROC
+	; Walks the fragments with index in [_fragi0,_fragi0_end) one row of
+	;  _nhfrags fragments at a time and, for each coded fragment, filters:
+	;   its left edge if it is not the first fragment of its row,
+	;   its top edge if its row start is greater than _fragi_top,
+	;   its right edge if a next fragment exists in the row and is not coded,
+	;   its bottom edge if fragi+_nhfrags<_fragi_bot and the fragment one row
+	;    down is not coded.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; The remaining arguments arrive on the stack and are loaded into r4-r9:
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; _frags and _frag_buf_offs are both stepped 4 bytes per fragment.
+	; NOTE(review): when _nhfrags<=1 this returns without filtering anything;
+	;  confirm that matches the intended behavior for single-column planes.
+	MOV	r12,r13			; r12= address of the stack arguments
+	STMFD	r13!,{r0,r4-r11,r14}	; [r13] now holds _ref_frame_data
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	; ll = *(int *)_bv
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; r7 = _fragi_bot-(_nhfrags-1)
+oslffri_v6_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		; if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		; if (fragi+1<_fragi_bot-(_nhfrags-1))
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && fragment below is not coded
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+	; Same loop bookkeeping as above for a fragment that was skipped.
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_loop_filter_init_neon
+	EXPORT	oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon PROC
+	; Stores eight 16-bit copies of 2*_flimit at _bv.
+	; The store carries a 128-bit alignment qualifier on _bv.
+	; oc_loop_filter_frag_rows_neon loads these 16 bytes back into Q15.
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  ; r1 = 2*L
+	VDUP.S16	Q15, r1		; Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0@128]
+	MOV	PC,r14
+	ENDP
+
+loop_filter_h_neon PROC
+	; Filters a vertical edge over 8 rows: loads the four pixels
+	;  _pix[-2.._pix+1] of each row, transposes them into columns, computes
+	;  the limited filter value and writes the two middle columns back.
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv (not read here; the limit comes from Q15)
+	; preserves r0-r3; clobbers r12, D0-D6 and Q8-Q10
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	; Doing a 2-element structure load saves doing two VTRN's below, at the
+	;  cost of using two more slower single-lane loads vs. the faster
+	;  all-lane loads.
+	; It's less code this way, though, and benches a hair faster, but it
+	;  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		; D0 = ____________1100     2,1
+						; D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		; D4 = ____________5544     2,1
+						; D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		; D0 = ________99881100     3,1
+						; D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		; D4 = ________DDCC5544     3,1
+						; D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		; D0 = ____GGHH99881100     3,1
+						; D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		; D4 = ____KKLLDDCC5544     3,1
+						; D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		; D0 = PPOOGGHH99881100     3,1
+						; D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		; D4 = TTSSKKLLDDCC5544     3,1
+						; D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   ; Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		; D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	; Store the two filtered bytes of each row at _pix-1, alternating between
+	;  D4 (even rows) and D2 (odd rows).
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	ENDP
+
+loop_filter_v_neon PROC
+	; Filters a horizontal edge across 8 columns: loads the four 8-byte rows
+	;  at _pix-2*_ystride, _pix-_ystride, _pix and _pix+_ystride, computes the
+	;  limited filter value and writes the two middle rows back.
+	; All row loads and stores carry a 64-bit alignment qualifier.
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv (not read here; the limit comes from Q15)
+	; preserves r0-r3; clobbers r12, D0-D6 and Q8-Q10
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12@64], r1		; D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12@64], r1		; D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12@64], r1		; D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12@64]			; D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1		; r12= address of the first row to store
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   ; Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12@64], r1
+	VST1.64	{D4}, [r12@64], r1
+	MOV	PC,r14
+	ENDP
+
+oc_loop_filter_frag_rows_neon PROC
+	; Walks the fragments with index in [_fragi0,_fragi0_end) one row of
+	;  _nhfrags fragments at a time and, for each coded fragment, filters:
+	;   its left edge if it is not the first fragment of its row,
+	;   its top edge if its row start is greater than _fragi_top,
+	;   its right edge if a next fragment exists in the row and is not coded,
+	;   its bottom edge if fragi+_nhfrags<_fragi_bot and the fragment one row
+	;    down is not coded.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; The remaining arguments arrive on the stack and are loaded into r4-r9:
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; _frags and _frag_buf_offs are both stepped 4 bytes per fragment.
+	; Q15 is loaded once from _bv (with a 128-bit alignment qualifier) and is
+	;  expected to survive every call to the edge filters.
+	; NOTE(review): when _nhfrags<=1 this returns without filtering anything;
+	;  confirm that matches the intended behavior for single-column planes.
+	MOV	r12,r13			; r12= address of the stack arguments
+	STMFD	r13!,{r0,r4-r11,r14}	; [r13] now holds _ref_frame_data
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	;		  bail
+	VLD1.64	{D30,D31}, [r2@128]	; Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; r7 = _fragi_bot-(_nhfrags-1)
+oslffri_neon_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		; if (fragi+1<_fragi_bot-(_nhfrags-1))
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && fragment below is not coded
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+	; Same loop bookkeeping as above for a fragment that was skipped.
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+	END
--- a/media/libtheora/lib/arm/armopts.s
+++ b/media/libtheora/lib/arm/armopts.s
@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+;********************************************************************
+
+; Assembly-time feature switches; each symbol is defined as a numeric
+;  constant (0 or 1) and tested by the "[ SYMBOL" conditional blocks in the
+;  other .s files.
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP		*	1
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA	*	1
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON		*	1
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+; NOTE(review): this is 0 here even though the ARMv6 and NEON code paths are
+;  enabled above; confirm that is the intended configuration.
+OC_ARM_CAN_UNALIGN	*	0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+;  boundary, so it's usually a bad idea to use them anyway if they can be
+;  avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD	*	0
+
+	END
--- a/media/libtheora/lib/arm/armstate.c
+++ b/media/libtheora/lib/arm/armstate.c
@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.
+  The first 64 entries are the zig-zag scan order with the row and column of
+   each destination index swapped.
+  The remaining 64 entries all map to index 64, one past the end of an 8x8
+   block.
+  NOTE(review): index 64 is presumably the "dumping ground" element for
+   out-of-range zig-zag indices described for dct_coeffs in decint.h; confirm
+   against the definition of OC_FZIG_ZAG.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+# endif
+
+/*Installs the ARM implementations of the shared accelerated functions.
+  The C defaults are installed first, then the baseline ARM routines, and then
+   each instruction-set-specific block below overrides the entries it provides.
+  A block only takes effect if its OC_ARM_ASM_* macro was defined at build time
+   and the matching OC_CPU_ARM_* bit is set in the flags returned by
+   oc_cpu_flags_get().
+  The blocks run in the order EDSP, ARMv6 media, NEON, so the last one that
+   applies wins for any entry set more than once.
+  Note the preprocessor nesting: the media block is only compiled when EDSP
+   support is, and the NEON block only when media support is.
+  _state: The codec state whose cpu_flags, opt_vtable and opt_data are filled
+           in.*/
+void oc_state_accel_init_arm(oc_theora_state *_state){
+  /*Start from the C defaults, then query the CPU flags tested below.*/
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  /*Baseline ARM versions: installed without consulting cpu_flags.*/
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+  /*Note: We _must_ set this function pointer, because the macro in armint.h
+     calls it with different arguments, so the C version will segfault.*/
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  /*EDSP: only the fragment copy routine has a dedicated version.*/
+  if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  /*ARMv6 media: replaces reconstruction, the iDCT and the loop filter, but
+     leaves frag_copy_list as set above.*/
+  if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  /*NEON: replaces every entry set so far.*/
+  if(_state->cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+#    endif
+    /*This is set even without the vtable: the NEON code uses the transposed
+       zig-zag table defined at the top of this file.*/
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+  }
+#   endif
+#  endif
+# endif
+}
+
+/*Reconstructs one fragment using the baseline ARM routines.
+  The coefficients are inverse transformed into the upper half of _dct_coeffs,
+   and the result is then written into the OC_FRAME_SELF buffer, either on its
+   own or combined with a prediction from one or two reference offsets.
+  _state:      The codec state supplying the fragment offsets, the reference
+                frames, the strides and the motion vectors.
+  _fragi:      The index of the fragment to reconstruct.
+  _pli:        The plane index, used to select the stride and passed on to
+                oc_state_get_mv_offsets().
+  _dct_coeffs: The coefficients in the first 64 entries; entries 64 and up
+                receive the output of the inverse transform.
+  _last_zzi:   Values below 2 select the DC-only path; otherwise it is passed
+                to the full inverse transform.
+  _dc_quant:   The multiplier applied to the DC coefficient.*/
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  const unsigned char *ref;
+  unsigned char       *dst;
+  ogg_int16_t         *residue;
+  ptrdiff_t            frag_off;
+  ogg_uint16_t         dc;
+  int                  mvoffsets[2];
+  int                  stride;
+  int                  refi;
+  int                  nmvs;
+  /*The inverse transform writes into the upper half of the buffer.*/
+  residue=_dct_coeffs+64;
+  if(_last_zzi>=2){
+    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(residue,_dct_coeffs,_last_zzi);
+  }
+  else{
+    /*Only a DC component is present.
+      This dequant product is rounded (and none of the others are) because
+       there's no iDCT rounding on this path.*/
+    dc=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_arm(residue,dc);
+  }
+  /*Locate the destination within the frame being reconstructed.*/
+  frag_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  stride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_off;
+  if(refi==OC_FRAME_SELF){
+    /*The fragment refers to the current frame: no predictor is used.*/
+    oc_frag_recon_intra_arm(dst,stride,residue);
+    return;
+  }
+  /*Otherwise add the residue to a predictor built from one or two offsets
+     into the reference frame, as reported by oc_state_get_mv_offsets().*/
+  ref=_state->ref_frame_data[refi]+frag_off;
+  nmvs=oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+   _state->frag_mvs[_fragi]);
+  if(nmvs>1){
+    oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],stride,
+     residue);
+  }
+  else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],stride,residue);
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+/*Reconstructs one fragment using the ARMv6 media routines.
+  The coefficients are inverse transformed into the upper half of _dct_coeffs,
+   and the result is then written into the OC_FRAME_SELF buffer, either on its
+   own or combined with a prediction from one or two reference offsets.
+  _state:      The codec state supplying the fragment offsets, the reference
+                frames, the strides and the motion vectors.
+  _fragi:      The index of the fragment to reconstruct.
+  _pli:        The plane index, used to select the stride and passed on to
+                oc_state_get_mv_offsets().
+  _dct_coeffs: The coefficients in the first 64 entries; entries 64 and up
+                receive the output of the inverse transform.
+  _last_zzi:   Values below 2 select the DC-only path; otherwise it is passed
+                to the full inverse transform.
+  _dc_quant:   The multiplier applied to the DC coefficient.*/
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  const unsigned char *ref;
+  unsigned char       *dst;
+  ogg_int16_t         *residue;
+  ptrdiff_t            frag_off;
+  ogg_uint16_t         dc;
+  int                  mvoffsets[2];
+  int                  stride;
+  int                  refi;
+  int                  nmvs;
+  /*The inverse transform writes into the upper half of the buffer.*/
+  residue=_dct_coeffs+64;
+  if(_last_zzi>=2){
+    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(residue,_dct_coeffs,_last_zzi);
+  }
+  else{
+    /*Only a DC component is present.
+      This dequant product is rounded (and none of the others are) because
+       there's no iDCT rounding on this path.*/
+    dc=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_v6(residue,dc);
+  }
+  /*Locate the destination within the frame being reconstructed.*/
+  frag_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  stride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_off;
+  if(refi==OC_FRAME_SELF){
+    /*The fragment refers to the current frame: no predictor is used.*/
+    oc_frag_recon_intra_v6(dst,stride,residue);
+    return;
+  }
+  /*Otherwise add the residue to a predictor built from one or two offsets
+     into the reference frame, as reported by oc_state_get_mv_offsets().*/
+  ref=_state->ref_frame_data[refi]+frag_off;
+  nmvs=oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+   _state->frag_mvs[_fragi]);
+  if(nmvs>1){
+    oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],stride,
+     residue);
+  }
+  else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],stride,residue);
+}
+
+# if defined(OC_ARM_ASM_NEON)
+/*Reconstructs one fragment using the NEON routines.
+  The coefficients are inverse transformed into the upper half of _dct_coeffs,
+   and the result is then written into the OC_FRAME_SELF buffer, either on its
+   own or combined with a prediction from one or two reference offsets.
+  _state:      The codec state supplying the fragment offsets, the reference
+                frames, the strides and the motion vectors.
+  _fragi:      The index of the fragment to reconstruct.
+  _pli:        The plane index, used to select the stride and passed on to
+                oc_state_get_mv_offsets().
+  _dct_coeffs: The coefficients in the first 64 entries; entries 64 and up
+                receive the output of the inverse transform.
+  _last_zzi:   Values below 2 select the DC-only path; otherwise it is passed
+                to the full inverse transform.
+  _dc_quant:   The multiplier applied to the DC coefficient.*/
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  const unsigned char *ref;
+  unsigned char       *dst;
+  ogg_int16_t         *residue;
+  ptrdiff_t            frag_off;
+  ogg_uint16_t         dc;
+  int                  mvoffsets[2];
+  int                  stride;
+  int                  refi;
+  int                  nmvs;
+  /*The inverse transform writes into the upper half of the buffer.*/
+  residue=_dct_coeffs+64;
+  if(_last_zzi>=2){
+    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(residue,_dct_coeffs,_last_zzi);
+  }
+  else{
+    /*Only a DC component is present.
+      This dequant product is rounded (and none of the others are) because
+       there's no iDCT rounding on this path.*/
+    dc=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_neon(residue,dc);
+  }
+  /*Locate the destination within the frame being reconstructed.*/
+  frag_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  stride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_off;
+  if(refi==OC_FRAME_SELF){
+    /*The fragment refers to the current frame: no predictor is used.*/
+    oc_frag_recon_intra_neon(dst,stride,residue);
+    return;
+  }
+  /*Otherwise add the residue to a predictor built from one or two offsets
+     into the reference frame, as reported by oc_state_get_mv_offsets().*/
+  ref=_state->ref_frame_data[refi]+frag_off;
+  nmvs=oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+   _state->frag_mvs[_fragi]);
+  if(nmvs>1){
+    oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],stride,
+     residue);
+  }
+  else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],stride,residue);
+}
+#  endif
+# endif
+
+#endif
--- a/media/libtheora/lib/bitpack.c
+++ b/media/libtheora/lib/bitpack.c
@ -11,7 +11,7 @@
 ********************************************************************

  function: packing variable sized words into an octet stream
-  last mod: $Id: bitpack.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id: bitpack.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/
 #include <string.h>
@ -32,15 +32,18 @@ static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
  const unsigned char *stop;
  oc_pb_window         window;
  int                  available;
+  unsigned             shift;
+  stop=_b->stop;
+  ptr=_b->ptr;
  window=_b->window;
  available=_b->bits;
-  ptr=_b->ptr;
-  stop=_b->stop;
-  while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
-    available+=8;
-    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+  shift=OC_PB_WINDOW_SIZE-available;
+  while(7<shift&&ptr<stop){
+    shift-=8;
+    window|=(oc_pb_window)*ptr++<<shift;
  }
  _b->ptr=ptr;
+  available=OC_PB_WINDOW_SIZE-shift;
  if(_bits>available){
    if(ptr>=stop){
      _b->eof=1;
@ -67,7 +70,7 @@ void oc_pack_adv1(oc_pack_buf *_b){
 }

 /*Here we assume that 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits){
+long oc_pack_read_c(oc_pack_buf *_b,int _bits){
  oc_pb_window window;
  int          available;
  long         result;
@ -82,12 +85,12 @@ long oc_pack_read(oc_pack_buf *_b,int _bits){
  available-=_bits;
  window<<=1;
  window<<=_bits-1;
-  _b->bits=available;
  _b->window=window;
+  _b->bits=available;
  return result;
 }

-int oc_pack_read1(oc_pack_buf *_b){
+int oc_pack_read1_c(oc_pack_buf *_b){
  oc_pb_window window;
  int          available;
  int          result;
@ -100,8 +103,8 @@ int oc_pack_read1(oc_pack_buf *_b){
  result=window>>OC_PB_WINDOW_SIZE-1;
  available--;
  window<<=1;
-  _b->bits=available;
  _b->window=window;
+  _b->bits=available;
  return result;
 }

--- a/media/libtheora/lib/bitpack.h
+++ b/media/libtheora/lib/bitpack.h
@ -16,15 +16,32 @@
 ********************************************************************/
 #if !defined(_bitpack_H)
 # define _bitpack_H (1)
+# include <stddef.h>
 # include <limits.h>
+# include "internal.h"



-typedef unsigned long      oc_pb_window;
+typedef size_t             oc_pb_window;
 typedef struct oc_pack_buf oc_pack_buf;



+/*Custom bitpacker implementations.
+  A platform header included here may pre-define oc_pack_read, oc_pack_read1
+   or oc_huff_token_decode.
+  Any of those names still undefined afterwards is mapped to its _c
+   counterpart, so callers can always use the unsuffixed name.*/
+# if defined(OC_ARM_ASM)
+#  include "arm/armbits.h"
+# endif
+
+# if !defined(oc_pack_read)
+#  define oc_pack_read oc_pack_read_c
+# endif
+# if !defined(oc_pack_read1)
+#  define oc_pack_read1 oc_pack_read1_c
+# endif
+# if !defined(oc_huff_token_decode)
+#  define oc_huff_token_decode oc_huff_token_decode_c
+# endif
+
 # define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
 /*This is meant to be a large, positive constant that can still be efficiently
   loaded as an immediate (on platforms like ARM, for example).
@ -34,9 +51,9 @@ typedef struct oc_pack_buf oc_pack_buf;


 struct oc_pack_buf{
-  oc_pb_window         window;
-  const unsigned char *ptr;
  const unsigned char *stop;
+  const unsigned char *ptr;
+  oc_pb_window         window;
  int                  bits;
  int                  eof;
 };
@ -45,8 +62,8 @@ void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
 int oc_pack_look1(oc_pack_buf *_b);
 void oc_pack_adv1(oc_pack_buf *_b);
 /*Here we assume 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits);
-int oc_pack_read1(oc_pack_buf *_b);
+long oc_pack_read_c(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_c(oc_pack_buf *_b);
 /* returns -1 for read beyond EOF, or the number of whole bytes available */
 long oc_pack_bytes_left(oc_pack_buf *_b);

--- a/media/libtheora/lib/config.h
+++ b/media/libtheora/lib/config.h
@ -32,7 +32,7 @@
 #define HAVE_STRING_H 1

 /* Define to 1 if you have the <sys/soundcard.h> header file. */
-/* #undef HAVE_SYS_SOUNDCARD_H */
+#define HAVE_SYS_SOUNDCARD_H 1

 /* Define to 1 if you have the <sys/stat.h> header file. */
 #define HAVE_SYS_STAT_H 1
@ -43,18 +43,29 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1

-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
-#define LT_OBJDIR ".libs/"
-
 /* Define to 1 if your C compiler doesn't accept -c and -o together. */
 /* #undef NO_MINUS_C_MINUS_O */

+/* make use of arm asm optimization */
+ 
+
+/* Define if assembler supports EDSP instructions */
+
+
+/* Define if assembler supports ARMv6 media instructions */
+
+
+/* Define if compiler supports NEON instructions */
+
+
+/* make use of c64x+ asm optimization */
+/* #undef OC_C64X_ASM */
+
 /* make use of x86_64 asm optimization */
 /* #undef OC_X86_64_ASM */

 /* make use of x86 asm optimization */
- /**/
+/* #undef OC_X86_ASM */

 /* Name of package */
 #define PACKAGE "libtheora"
@ -66,16 +77,13 @@
 #define PACKAGE_NAME "libtheora"

 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libtheora 1.1.1+svn"
+#define PACKAGE_STRING "libtheora 1.2.0alpha1+svn"

 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "libtheora"

-/* Define to the home page for this package. */
-#define PACKAGE_URL ""
-
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.1.1+svn"
+#define PACKAGE_VERSION "1.2.0alpha1+svn"

 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
@ -87,4 +95,4 @@
 /* #undef THEORA_DISABLE_FLOAT */

 /* Version number of package */
-#define VERSION "1.1.1+svn"
+#define VERSION "1.2.0alpha1+svn"
--- a/media/libtheora/lib/decinfo.c
+++ b/media/libtheora/lib/decinfo.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: decinfo.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: decinfo.c 17276 2010-06-05 05:57:05Z tterribe $

 ********************************************************************/

@ -128,6 +128,10 @@ static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
   _tc->comments*sizeof(_tc->comment_lengths[0]));
  _tc->user_comments=(char **)_ogg_malloc(
   _tc->comments*sizeof(_tc->user_comments[0]));
+  if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
+    _tc->comments=0;
+    return TH_EFAULT;
+  }
  for(i=0;i<_tc->comments;i++){
    len=oc_unpack_length(_opb);
    if(len<0||len>oc_pack_bytes_left(_opb)){
--- a/media/libtheora/lib/decint.h
+++ b/media/libtheora/lib/decint.h
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: decint.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: decint.h 17457 2010-09-24 02:05:49Z tterribe $

 ********************************************************************/

@ -19,15 +19,39 @@
 #if !defined(_decint_H)
 # define _decint_H (1)
 # include "theora/theoradec.h"
-# include "internal.h"
+# include "state.h"
 # include "bitpack.h"
-
-typedef struct th_setup_info oc_setup_info;
-typedef struct th_dec_ctx    oc_dec_ctx;
-
 # include "huffdec.h"
 # include "dequant.h"

+typedef struct th_setup_info         oc_setup_info;
+typedef struct oc_dec_opt_vtable     oc_dec_opt_vtable;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct th_dec_ctx            oc_dec_ctx;
+
+
+
+/*Decoder-specific accelerated functions.
+  A platform header included here may pre-define any of the names below; each
+   name that is still undefined afterwards falls back to a default.
+  When OC_DEC_USE_VTABLE is defined, oc_dec_dc_unpredict_mcu_plane() calls
+   through the decoder's opt_vtable; otherwise it resolves directly to the C
+   version.*/
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xdec.h"
+# endif
+
+# if !defined(oc_dec_accel_init)
+#  define oc_dec_accel_init oc_dec_accel_init_c
+# endif
+# if defined(OC_DEC_USE_VTABLE)
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
+ ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
+#  endif
+# else
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
+#  endif
+# endif
+
+
+
 /*Constants for the packet-in state machine specific to the decoder.*/

 /*Next packet to read: Data packet.*/
@ -37,71 +61,126 @@ typedef struct th_dec_ctx    oc_dec_ctx;

 struct th_setup_info{
  /*The Huffman codes.*/
-  oc_huff_node      *huff_tables[TH_NHUFFMAN_TABLES];
+  ogg_int16_t   *huff_tables[TH_NHUFFMAN_TABLES];
  /*The quantization parameters.*/
  th_quant_info  qinfo;
 };



+/*Decoder specific functions with accelerated variants.
+  When OC_DEC_USE_VTABLE is defined, the oc_dec_dc_unpredict_mcu_plane() macro
+   calls through the dc_unpredict_mcu_plane pointer stored here.*/
+struct oc_dec_opt_vtable{
+  void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
+   oc_dec_pipeline_state *_pipe,int _pli);
+};
+
+
+
+/*Working state for the decoder pipeline.
+  th_dec_ctx embeds one of these as its pipe member, and it is the _pipe
+   argument of the dc_unpredict_mcu_plane functions.*/
+struct oc_dec_pipeline_state{
+  /*Decoded DCT coefficients.
+    These are placed here instead of on the stack so that they can persist
+     between blocks, which makes clearing them back to zero much faster when
+     only a few non-zero coefficients were decoded.
+    It requires at least 65 elements because the zig-zag index array uses the
+     65th element as a dumping ground for out-of-range indices to protect us
+     from buffer overflow.
+    We make it fully twice as large so that the second half can serve as the
+     reconstruction buffer, which saves passing another parameter to all the
+     acceleration functions.
+    It also solves problems with 16-byte alignment for NEON on ARM.
+    gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+     alignment, and silently produces incorrect results if you ask for 16.
+    Finally, keeping it off the stack means there's less likely to be a data
+     hazard between the NEON co-processor and the regular ARM core, which
+     avoids unnecessary stalls.*/
+  OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *dequant[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][4];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
+};
+
+
 struct th_dec_ctx{
  /*Shared encoder/decoder state.*/
-  oc_theora_state      state;
+  oc_theora_state        state;
  /*Whether or not packets are ready to be emitted.
    This takes on negative values while there are remaining header packets to
     be emitted, reaches 0 when the codec is ready for input, and goes to 1
     when a frame has been processed and a data packet is ready.*/
-  int                  packet_state;
+  int                    packet_state;
  /*Buffer in which to assemble packets.*/
-  oc_pack_buf          opb;
+  oc_pack_buf            opb;
  /*Huffman decode trees.*/
-  oc_huff_node        *huff_tables[TH_NHUFFMAN_TABLES];
+  ogg_int16_t           *huff_tables[TH_NHUFFMAN_TABLES];
  /*The index of the first token in each plane for each coefficient.*/
-  ptrdiff_t            ti0[3][64];
+  ptrdiff_t              ti0[3][64];
  /*The number of outstanding EOB runs at the start of each coefficient in each
     plane.*/
-  ptrdiff_t            eob_runs[3][64];
+  ptrdiff_t              eob_runs[3][64];
  /*The DCT token lists.*/
-  unsigned char       *dct_tokens;
+  unsigned char         *dct_tokens;
  /*The extra bits associated with DCT tokens.*/
-  unsigned char       *extra_bits;
+  unsigned char         *extra_bits;
  /*The number of dct tokens unpacked so far.*/
-  int                  dct_tokens_count;
+  int                    dct_tokens_count;
  /*The out-of-loop post-processing level.*/
-  int                  pp_level;
+  int                    pp_level;
  /*The DC scale used for out-of-loop deblocking.*/
-  int                  pp_dc_scale[64];
+  int                    pp_dc_scale[64];
  /*The sharpen modifier used for out-of-loop deringing.*/
-  int                  pp_sharp_mod[64];
+  int                    pp_sharp_mod[64];
  /*The DC quantization index of each block.*/
-  unsigned char       *dc_qis;
+  unsigned char         *dc_qis;
  /*The variance of each block.*/
-  int                 *variances;
+  int                   *variances;
  /*The storage for the post-processed frame buffer.*/
-  unsigned char       *pp_frame_data;
+  unsigned char         *pp_frame_data;
  /*Whether or not the post-processsed frame buffer has space for chroma.*/
-  int                  pp_frame_state;
+  int                    pp_frame_state;
  /*The buffer used for the post-processed frame.
    Note that this is _not_ guaranteed to have the same strides and offsets as
     the reference frame buffers.*/
-  th_ycbcr_buffer      pp_frame_buf;
+  th_ycbcr_buffer        pp_frame_buf;
  /*The striped decode callback function.*/
-  th_stripe_callback   stripe_cb;
+  th_stripe_callback     stripe_cb;
+  oc_dec_pipeline_state  pipe;
+# if defined(OC_DEC_USE_VTABLE)
+  /*Table for decoder acceleration functions.*/
+  oc_dec_opt_vtable      opt_vtable;
+# endif
 # if defined(HAVE_CAIRO)
  /*Output metrics for debugging.*/
-  int                  telemetry;
-  int                  telemetry_mbmode;
-  int                  telemetry_mv;
-  int                  telemetry_qi;
-  int                  telemetry_bits;
-  int                  telemetry_frame_bytes;
-  int                  telemetry_coding_bytes;
-  int                  telemetry_mode_bytes;
-  int                  telemetry_mv_bytes;
-  int                  telemetry_qi_bytes;
-  int                  telemetry_dc_bytes;
-  unsigned char       *telemetry_frame_data;
+  int                    telemetry;
+  int                    telemetry_mbmode;
+  int                    telemetry_mv;
+  int                    telemetry_qi;
+  int                    telemetry_bits;
+  int                    telemetry_frame_bytes;
+  int                    telemetry_coding_bytes;
+  int                    telemetry_mode_bytes;
+  int                    telemetry_mv_bytes;
+  int                    telemetry_qi_bytes;
+  int                    telemetry_dc_bytes;
+  unsigned char         *telemetry_frame_data;
 # endif
 };

+/*Default pure-C implementations of decoder-specific accelerated functions.*/
+void oc_dec_accel_init_c(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
 #endif
--- a/media/libtheora/lib/decode.c
+++ b/media/libtheora/lib/decode.c
--- a/media/libtheora/lib/encint.h
+++ b/media/libtheora/lib/encint.h
@ -1,493 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: encint.h 16503 2009-08-22 18:14:02Z giles $
-
- ********************************************************************/
-#if !defined(_encint_H)
-# define _encint_H (1)
-# if defined(HAVE_CONFIG_H)
-#  include "config.h"
-# endif
-# include "theora/theoraenc.h"
-# include "internal.h"
-# include "ocintrin.h"
-# include "mathops.h"
-# include "enquant.h"
-# include "huffenc.h"
-/*# define OC_COLLECT_METRICS*/
-
-
-
-typedef oc_mv                         oc_mv2[2];
-
-typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
-typedef struct oc_mb_enc_info         oc_mb_enc_info;
-typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
-typedef struct oc_iir_filter          oc_iir_filter;
-typedef struct oc_frame_metrics       oc_frame_metrics;
-typedef struct oc_rc_state            oc_rc_state;
-typedef struct th_enc_ctx             oc_enc_ctx;
-typedef struct oc_token_checkpoint    oc_token_checkpoint;
-
-
-
-/*Constants for the packet-out state machine specific to the encoder.*/
-
-/*Next packet to emit: Data packet, but none are ready yet.*/
-#define OC_PACKET_EMPTY (0)
-/*Next packet to emit: Data packet, and one is ready.*/
-#define OC_PACKET_READY (1)
-
-/*All features enabled.*/
-#define OC_SP_LEVEL_SLOW       (0)
-/*Enable early skip.*/
-#define OC_SP_LEVEL_EARLY_SKIP (1)
-/*Disable motion compensation.*/
-#define OC_SP_LEVEL_NOMC       (2)
-/*Maximum valid speed level.*/
-#define OC_SP_LEVEL_MAX        (2)
-
-
-/*The bits used for each of the MB mode codebooks.*/
-extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
-
-/*The bits used for each of the MV codebooks.*/
-extern const unsigned char OC_MV_BITS[2][64];
-
-/*The minimum value that can be stored in a SB run for each codeword.
-  The last entry is the upper bound on the length of a single SB run.*/
-extern const ogg_uint16_t  OC_SB_RUN_VAL_MIN[8];
-/*The bits used for each SB run codeword.*/
-extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
-
-/*The bits used for each block run length (starting with 1).*/
-extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
-
-
-
-/*Encoder specific functions with accelerated variants.*/
-struct oc_enc_opt_vtable{
-  unsigned (*frag_sad)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride);
-  unsigned (*frag_sad_thresh)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride,unsigned _thresh);
-  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
-   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
-   unsigned _thresh);
-  unsigned (*frag_satd_thresh)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride,unsigned _thresh);
-  unsigned (*frag_satd2_thresh)(const unsigned char *_src,
-   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
-   unsigned _thresh);
-  unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
-  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
-   const unsigned char *_ref,int _ystride);
-  void     (*frag_sub_128)(ogg_int16_t _diff[64],
-   const unsigned char *_src,int _ystride);
-  void     (*frag_copy2)(unsigned char *_dst,
-   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
-   const ogg_int16_t _residue[64]);
-  void     (*frag_recon_inter)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-};
-
-
-void oc_enc_vtable_init(oc_enc_ctx *_enc);
-
-
-
-/*Encoder-specific macroblock information.*/
-struct oc_mb_enc_info{
-  /*Neighboring macro blocks that have MVs available from the current frame.*/
-  unsigned      cneighbors[4];
-  /*Neighboring macro blocks to use for MVs from the previous frame.*/
-  unsigned      pneighbors[4];
-  /*The number of current-frame neighbors.*/
-  unsigned char ncneighbors;
-  /*The number of previous-frame neighbors.*/
-  unsigned char npneighbors;
-  /*Flags indicating which MB modes have been refined.*/
-  unsigned char refined;
-  /*Motion vectors for a macro block for the current frame and the
-     previous two frames.
-    Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
-     can be used to estimate constant velocity and constant acceleration
-     predictors.
-    Uninitialized MVs are (0,0).*/
-  oc_mv2        analysis_mv[3];
-  /*Current unrefined analysis MVs.*/
-  oc_mv         unref_mv[2];
-  /*Unrefined block MVs.*/
-  oc_mv         block_mv[4];
-  /*Refined block MVs.*/
-  oc_mv         ref_mv[4];
-  /*Minimum motion estimation error from the analysis stage.*/
-  ogg_uint16_t  error[2];
-  /*MB error for half-pel refinement for each frame type.*/
-  unsigned      satd[2];
-  /*Block error for half-pel refinement.*/
-  unsigned      block_satd[4];
-};
-
-
-
-/*State machine to estimate the opportunity cost of coding a MB mode.*/
-struct oc_mode_scheme_chooser{
-  /*Pointers to the a list containing the index of each mode in the mode
-     alphabet used by each scheme.
-    The first entry points to the dynamic scheme0_ranks, while the remaining 7
-     point to the constant entries stored in OC_MODE_SCHEMES.*/
-  const unsigned char *mode_ranks[8];
-  /*The ranks for each mode when coded with scheme 0.
-    These are optimized so that the more frequent modes have lower ranks.*/
-  unsigned char        scheme0_ranks[OC_NMODES];
-  /*The list of modes, sorted in descending order of frequency, that
-    corresponds to the ranks above.*/
-  unsigned char        scheme0_list[OC_NMODES];
-  /*The number of times each mode has been chosen so far.*/
-  int                  mode_counts[OC_NMODES];
-  /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
-  unsigned char        scheme_list[8];
-  /*The number of bits used by each mode coding scheme.*/
-  ptrdiff_t            scheme_bits[8];
-};
-
-
-void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
-
-
-
-/*A 2nd order low-pass Bessel follower.
-  We use this for rate control because it has fast reaction time, but is
-   critically damped.*/
-struct oc_iir_filter{
-  ogg_int32_t c[2];
-  ogg_int64_t g;
-  ogg_int32_t x[2];
-  ogg_int32_t y[2];
-};
-
-
-
-/*The 2-pass metrics associated with a single frame.*/
-struct oc_frame_metrics{
-  /*The log base 2 of the scale factor for this frame in Q24 format.*/
-  ogg_int32_t   log_scale;
-  /*The number of application-requested duplicates of this frame.*/
-  unsigned      dup_count:31;
-  /*The frame type from pass 1.*/
-  unsigned      frame_type:1;
-};
-
-
-
-/*Rate control state information.*/
-struct oc_rc_state{
-  /*The target average bits per frame.*/
-  ogg_int64_t        bits_per_frame;
-  /*The current buffer fullness (bits available to be used).*/
-  ogg_int64_t        fullness;
-  /*The target buffer fullness.
-    This is where we'd like to be by the last keyframe the appears in the next
-     buf_delay frames.*/
-  ogg_int64_t        target;
-  /*The maximum buffer fullness (total size of the buffer).*/
-  ogg_int64_t        max;
-  /*The log of the number of pixels in a frame in Q57 format.*/
-  ogg_int64_t        log_npixels;
-  /*The exponent used in the rate model in Q8 format.*/
-  unsigned           exp[2];
-  /*The number of frames to distribute the buffer usage over.*/
-  int                buf_delay;
-  /*The total drop count from the previous frame.
-    This includes duplicates explicitly requested via the
-     TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
-  ogg_uint32_t       prev_drop_count;
-  /*The log of an estimated scale factor used to obtain the real framerate, for
-     VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
-  ogg_int64_t        log_drop_scale;
-  /*The log of estimated scale factor for the rate model in Q57 format.*/
-  ogg_int64_t        log_scale[2];
-  /*The log of the target quantizer level in Q57 format.*/
-  ogg_int64_t        log_qtarget;
-  /*Will we drop frames to meet bitrate target?*/
-  unsigned char      drop_frames;
-  /*Do we respect the maximum buffer fullness?*/
-  unsigned char      cap_overflow;
-  /*Can the reservoir go negative?*/
-  unsigned char      cap_underflow;
-  /*Second-order lowpass filters to track scale and VFR.*/
-  oc_iir_filter      scalefilter[2];
-  int                inter_count;
-  int                inter_delay;
-  int                inter_delay_target;
-  oc_iir_filter      vfrfilter;
-  /*Two-pass mode state.
-    0 => 1-pass encoding.
-    1 => 1st pass of 2-pass encoding.
-    2 => 2nd pass of 2-pass encoding.*/
-  int                twopass;
-  /*Buffer for current frame metrics.*/
-  unsigned char      twopass_buffer[48];
-  /*The number of bytes in the frame metrics buffer.
-    When 2-pass encoding is enabled, this is set to 0 after each frame is
-     submitted, and must be non-zero before the next frame will be accepted.*/
-  int                twopass_buffer_bytes;
-  int                twopass_buffer_fill;
-  /*Whether or not to force the next frame to be a keyframe.*/
-  unsigned char      twopass_force_kf;
-  /*The metrics for the previous frame.*/
-  oc_frame_metrics   prev_metrics;
-  /*The metrics for the current frame.*/
-  oc_frame_metrics   cur_metrics;
-  /*The buffered metrics for future frames.*/
-  oc_frame_metrics  *frame_metrics;
-  int                nframe_metrics;
-  int                cframe_metrics;
-  /*The index of the current frame in the circular metric buffer.*/
-  int                frame_metrics_head;
-  /*The frame count of each type (keyframes, delta frames, and dup frames);
-     32 bits limits us to 2.268 years at 60 fps.*/
-  ogg_uint32_t       frames_total[3];
-  /*The number of frames of each type yet to be processed.*/
-  ogg_uint32_t       frames_left[3];
-  /*The sum of the scale values for each frame type.*/
-  ogg_int64_t        scale_sum[2];
-  /*The start of the window over which the current scale sums are taken.*/
-  int                scale_window0;
-  /*The end of the window over which the current scale sums are taken.*/
-  int                scale_window_end;
-  /*The frame count of each type in the current 2-pass window; this does not
-     include dup frames.*/
-  int                nframes[3];
-  /*The total accumulated estimation bias.*/
-  ogg_int64_t        rate_bias;
-};
-
-
-void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
-void oc_rc_state_clear(oc_rc_state *_rc);
-
-void oc_enc_rc_resize(oc_enc_ctx *_enc);
-int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
-void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
-int oc_enc_update_rc_state(oc_enc_ctx *_enc,
- long _bits,int _qti,int _qi,int _trial,int _droppable);
-int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
-int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
-
-
-
-/*The internal encoder state.*/
-struct th_enc_ctx{
-  /*Shared encoder/decoder state.*/
-  oc_theora_state          state;
-  /*Buffer in which to assemble packets.*/
-  oggpack_buffer           opb;
-  /*Encoder-specific macroblock information.*/
-  oc_mb_enc_info          *mb_info;
-  /*DC coefficients after prediction.*/
-  ogg_int16_t             *frag_dc;
-  /*The list of coded macro blocks, in coded order.*/
-  unsigned                *coded_mbis;
-  /*The number of coded macro blocks.*/
-  size_t                   ncoded_mbis;
-  /*Whether or not packets are ready to be emitted.
-    This takes on negative values while there are remaining header packets to
-     be emitted, reaches 0 when the codec is ready for input, and becomes
-     positive when a frame has been processed and data packets are ready.*/
-  int                      packet_state;
-  /*The maximum distance between keyframes.*/
-  ogg_uint32_t             keyframe_frequency_force;
-  /*The number of duplicates to produce for the next frame.*/
-  ogg_uint32_t             dup_count;
-  /*The number of duplicates remaining to be emitted for the current frame.*/
-  ogg_uint32_t             nqueued_dups;
-  /*The number of duplicates emitted for the last frame.*/
-  ogg_uint32_t             prev_dup_count;
-  /*The current speed level.*/
-  int                      sp_level;
-  /*Whether or not VP3 compatibility mode has been enabled.*/
-  unsigned char            vp3_compatible;
-  /*Whether or not any INTER frames have been coded.*/
-  unsigned char            coded_inter_frame;
-  /*Whether or not previous frame was dropped.*/
-  unsigned char            prevframe_dropped;
-  /*Stores most recently chosen Huffman tables for each frame type, DC and AC
-     coefficients, and luma and chroma tokens.
-    The actual Huffman table used for a given coefficient depends not only on
-     the choice made here, but also its index in the zig-zag ordering.*/
-  unsigned char            huff_idxs[2][2][2];
-  /*Current count of bits used by each MV coding mode.*/
-  size_t                   mv_bits[2];
-  /*The mode scheme chooser for estimating mode coding costs.*/
-  oc_mode_scheme_chooser   chooser;
-  /*The number of vertical super blocks in an MCU.*/
-  int                      mcu_nvsbs;
-  /*The SSD error for skipping each fragment in the current MCU.*/
-  unsigned                *mcu_skip_ssd;
-  /*The DCT token lists for each coefficient and each plane.*/
-  unsigned char          **dct_tokens[3];
-  /*The extra bits associated with each DCT token.*/
-  ogg_uint16_t           **extra_bits[3];
-  /*The number of DCT tokens for each coefficient for each plane.*/
-  ptrdiff_t                ndct_tokens[3][64];
-  /*Pending EOB runs for each coefficient for each plane.*/
-  ogg_uint16_t             eob_run[3][64];
-  /*The offset of the first DCT token for each coefficient for each plane.*/
-  unsigned char            dct_token_offs[3][64];
-  /*The last DC coefficient for each plane and reference frame.*/
-  int                      dc_pred_last[3][3];
-#if defined(OC_COLLECT_METRICS)
-  /*Fragment SATD statistics for MB mode estimation metrics.*/
-  unsigned                *frag_satd;
-  /*Fragment SSD statistics for MB mode estimation metrics.*/
-  unsigned                *frag_ssd;
-#endif
-  /*The R-D optimization parameter.*/
-  int                      lambda;
-  /*The huffman tables in use.*/
-  th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
-  /*The quantization parameters in use.*/
-  th_quant_info            qinfo;
-  oc_iquant               *enquant_tables[64][3][2];
-  oc_iquant_table          enquant_table_data[64][3][2];
-  /*An "average" quantizer for each quantizer type (INTRA or INTER) and qi
-     value.
-    This is used to paramterize the rate control decisions.
-    They are kept in the log domain to simplify later processing.
-    Keep in mind these are DCT domain quantizers, and so are scaled by an
-     additional factor of 4 from the pixel domain.*/
-  ogg_int64_t              log_qavg[2][64];
-  /*The buffer state used to drive rate control.*/
-  oc_rc_state              rc;
-  /*Table for encoder acceleration functions.*/
-  oc_enc_opt_vtable        opt_vtable;
-};
-
-
-void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
-int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
-#if defined(OC_COLLECT_METRICS)
-void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
-void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);
-#endif
-
-
-
-/*Perform fullpel motion search for a single MB against both reference frames.*/
-void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi);
-/*Refine a MB MV for one frame.*/
-void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
-/*Refine the block MVs.*/
-void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
-
-
-
-/*Used to rollback a tokenlog transaction when we retroactively decide to skip
-   a fragment.
-  A checkpoint is taken right before each token is added.*/
-struct oc_token_checkpoint{
-  /*The color plane the token was added to.*/
-  unsigned char pli;
-  /*The zig-zag index the token was added to.*/
-  unsigned char zzi;
-  /*The outstanding EOB run count before the token was added.*/
-  ogg_uint16_t  eob_run;
-  /*The token count before the token was added.*/
-  ptrdiff_t     ndct_tokens;
-};
-
-
-
-void oc_enc_tokenize_start(oc_enc_ctx *_enc);
-int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
- int _zzi,oc_token_checkpoint **_stack,int _acmin);
-void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
- const oc_token_checkpoint *_stack,int _n);
-void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
- int _pli,int _fragy0,int _frag_yend);
-void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
- const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
- int _prev_ndct_tokens1,int _prev_eob_run1);
-void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
-
-
-
-/*Utility routine to encode one of the header packets.*/
-int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
- oggpack_buffer *_opb,const th_quant_info *_qinfo,
- const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
- const char *_vendor,th_comment *_tc,ogg_packet *_op);
-
-
-
-/*Encoder-specific accelerated functions.*/
-void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride);
-void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride);
-unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
- unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
- const ogg_int16_t _x[64]);
-
-/*Default pure-C implementations.*/
-void oc_enc_vtable_init_c(oc_enc_ctx *_enc);
-
-void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride);
-void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2_c(unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-unsigned oc_enc_frag_sad_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
-void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-
-#endif
--- a/media/libtheora/lib/encoder_disabled.c
+++ b/media/libtheora/lib/encoder_disabled.c
@ -1,67 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: encoder_disabled.c 16503 2009-08-22 18:14:02Z giles $
-
- ********************************************************************/
-#include "apiwrapper.h"
-#include "encint.h"
-
-th_enc_ctx *th_encode_alloc(const th_info *_info){
-  return NULL;
-}
-
-void th_encode_free(th_enc_ctx *_enc){}
-
-
-int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
-  return OC_DISABLED;
-}
-
-int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
-  return OC_DISABLED;
-}
-
-int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-
-
-int theora_encode_init(theora_state *_te,theora_info *_ci){
-  return OC_DISABLED;
-}
-
-int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
-  return OC_DISABLED;
-}
-
-int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int theora_encode_header(theora_state *_te,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
-  return OC_DISABLED;
-}
-
-int theora_encode_tables(theora_state *_te,ogg_packet *_op){
-  return OC_DISABLED;
-}
--- a/media/libtheora/lib/enquant.h
+++ b/media/libtheora/lib/enquant.h
@ -1,27 +0,0 @@
-#if !defined(_enquant_H)
-# define _enquant_H (1)
-# include "quant.h"
-
-typedef struct oc_iquant oc_iquant;
-
-#define OC_QUANT_MAX_LOG (OC_Q57(OC_STATIC_ILOG_32(OC_QUANT_MAX)-1))
-
-/*Used to compute x/d via ((x*m>>16)+x>>l)+(x<0))
-   (i.e., one 16x16->16 mul, 2 shifts, and 2 adds).
-  This is not an approximation; for 16-bit x and d, it is exact.*/
-struct oc_iquant{
-  ogg_int16_t m;
-  ogg_int16_t l;
-};
-
-typedef oc_iquant        oc_iquant_table[64];
-
-
-
-void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
-void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
- oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
-void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
- ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
-
-#endif
--- a/media/libtheora/lib/fragment.c
+++ b/media/libtheora/lib/fragment.c
@ -11,17 +11,12 @@
 ********************************************************************

  function:
-    last mod: $Id: fragment.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: fragment.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/
 #include <string.h>
 #include "internal.h"

-void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride){
-  (*_state->opt_vtable.frag_copy)(_dst,_src,_ystride);
-}
-
 void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
  int i;
  for(i=8;i-->0;){
@ -31,9 +26,24 @@ void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
  }
 }

-void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
- int _ystride,const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue);
+/*Copy a set of fragments, named by index, out of one reference frame and
+   into another.
+  Each listed fragment is transferred with oc_frag_copy_c(), using the same
+   buffer offset in both frames.
+  _dst_frame:     The reference frame that receives the fragments.
+  _src_frame:     The reference frame the fragments are read from.
+  _ystride:       The row stride shared by both reference frames.
+  _fragis:        The list of indices of the fragments to copy.
+  _nfragis:       How many indices _fragis contains.
+  _frag_buf_offs: The offset of each fragment within a reference frame,
+                   indexed by fragment index.*/
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t i;
+  i=0;
+  while(i<_nfragis){
+    ptrdiff_t off;
+    off=_frag_buf_offs[_fragis[i++]];
+    oc_frag_copy_c(_dst_frame+off,_src_frame+off,_ystride);
+  }
 }

 void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
@ -46,11 +56,6 @@ void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
  }
 }

-void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue);
-}
-
 void oc_frag_recon_inter_c(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
  int i;
@ -62,12 +67,6 @@ void oc_frag_recon_inter_c(unsigned char *_dst,
  }
 }

-void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride,
- const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue);
-}
-
 void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
  int i;
@ -80,8 +79,4 @@ void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
  }
 }

-void oc_restore_fpu(const oc_theora_state *_state){
-  _state->opt_vtable.restore_fpu();
-}
-
 void oc_restore_fpu_c(void){}
--- a/media/libtheora/lib/huffdec.c
+++ b/media/libtheora/lib/huffdec.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: huffdec.c 16702 2009-11-15 00:40:55Z tterribe $
+    last mod: $Id: huffdec.c 17577 2010-10-29 04:00:07Z tterribe $

 ********************************************************************/

@ -22,14 +22,60 @@
 #include "decint.h"


-/*The ANSI offsetof macro is broken on some platforms (e.g., older DECs).*/
-#define _ogg_offsetof(_type,_field)\
- ((size_t)((char *)&((_type *)0)->_field-(char *)0))

-/*The number of internal tokens associated with each of the spec tokens.*/
-static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
-  1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
-};
+/*Instead of storing every branching in the tree, subtrees can be collapsed
+   into one node, with a table of size 1<<nbits pointing directly to its
+   descendants nbits levels down.
+  This allows more than one bit to be read at a time, and avoids following all
+   the intermediate branches with next to no increased code complexity once
+   the collapsed tree has been built.
+  We do _not_ require that a subtree be complete to be collapsed, but instead
+   store duplicate pointers in the table, and record the actual depth of the
+   node below its parent.
+  This tells us the number of bits to advance the stream after reaching it.
+
+  This turns out to be equivalent to the method described in \cite{Hash95},
+   without the requirement that codewords be sorted by length.
+  If the codewords were sorted by length (so-called ``canonical-codes''), they
+   could be decoded much faster via either Lindell and Moffat's approach or
+   Hashemian's Condensed Huffman Code approach, the latter of which has an
+   extremely small memory footprint.
+  We can't use Choueka et al.'s finite state machine approach, which is
+   extremely fast, because we can't allow multiple symbols to be output at a
+   time; the codebook can and does change between symbols.
+  It also has very large memory requirements, which impairs cache coherency.
+
+  We store the tree packed in an array of 16-bit integers (words).
+  Each node consists of a single word, followed consecutively by two or more
+   indices of its children.
+  Let n be the value of this first word.
+  This is the number of bits that need to be read to traverse the node, and
+   must be positive.
+  1<<n entries follow in the array, each an index to a child node.
+  If the child is positive, then it is the index of another internal node in
+   the table.
+  If the child is negative or zero, then it is a leaf node.
+  These are stored directly in the child pointer to save space, since they only
+   require a single word.
+  If a leaf node would have been encountered before reading n bits, then it is
+   duplicated the necessary number of times in this table.
+  Leaf nodes pack both a token value and their actual depth in the tree.
+  The token in the leaf node is (-leaf&255).
+  The number of bits that need to be consumed to reach the leaf, starting from
+   the current node, is (-leaf>>8).
+
+  @ARTICLE{Hash95,
+    author="Reza Hashemian",
+    title="Memory Efficient and High-Speed Search {Huffman} Coding",
+    journal="{IEEE} Transactions on Communications",
+    volume=43,
+    number=10,
+    pages="2576--2581",
+    month=Oct,
+    year=1995
+  }*/
+
+

 /*The map from external spec-defined tokens to internal tokens.
  This is constructed so that any extra bits read with the original token value
@ -99,391 +145,371 @@ static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={
  40
 };

-/*These three functions are really part of the bitpack.c module, but
-   they are only used here.
-  Declaring local static versions so they can be inlined saves considerable
-   function call overhead.*/
-
-static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
-  const unsigned char *ptr;
-  const unsigned char *stop;
-  oc_pb_window         window;
-  int                  available;
-  window=_b->window;
-  available=_b->bits;
-  ptr=_b->ptr;
-  stop=_b->stop;
-  /*This version of _refill() doesn't bother setting eof because we won't
-     check for it after we've started decoding DCT tokens.*/
-  if(ptr>=stop)available=OC_LOTS_OF_BITS;
-  while(available<=OC_PB_WINDOW_SIZE-8){
-    available+=8;
-    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
-    if(ptr>=stop)available=OC_LOTS_OF_BITS;
-  }
-  _b->ptr=ptr;
-  if(_bits>available)window|=*ptr>>(available&7);
-  _b->bits=available;
-  return window;
-}
+/*The log base 2 of the number of internal tokens associated with each spec
+   token (i.e., how many of the extra bits are folded into the token value).
+  Increasing the maximum value beyond 3 will enlarge the amount of stack
+   required for tree construction.*/
+static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={
+  0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3
+};


-/*Read in bits without advancing the bit pointer.
-  Here we assume 0<=_bits&&_bits<=32.*/
-static long oc_pack_look(oc_pack_buf *_b,int _bits){
-  oc_pb_window window;
-  int          available;
-  long         result;
-  window=_b->window;
-  available=_b->bits;
-  if(_bits==0)return 0;
-  if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits);
-  result=window>>OC_PB_WINDOW_SIZE-_bits;
-  return result;
-}
-
-/*Advance the bit pointer.*/
-static void oc_pack_adv(oc_pack_buf *_b,int _bits){
-  /*We ignore the special cases for _bits==0 and _bits==32 here, since they are
-     never used actually used.
-    OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read
-     32 bits in a single go, and would require a 32 GB lookup table (assuming
-     8 byte pointers, since 4 byte pointers couldn't fit such a table).*/
-  _b->window<<=_bits;
-  _b->bits-=_bits;
-}
-
-
-/*The log_2 of the size of a lookup table is allowed to grow to relative to
-   the number of unique nodes it contains.
-  E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
-   wasted (each node will have an amortized cost of at most 20 bytes when using
-   4-byte pointers).
+/*The size a lookup table is allowed to grow to relative to the number of
+   unique nodes it contains.
+  E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is
+   wasted (1/4 of the space must be used).
  Larger numbers can decode tokens with fewer read operations, while smaller
-   numbers may save more space (requiring as little as 8 bytes amortized per
-   node, though there will be more nodes).
+   numbers may save more space.
  With a sample file:
  32233473 read calls are required when no tree collapsing is done (100.0%).
-  19269269 read calls are required when OC_HUFF_SLUSH is 0 (59.8%).
-  11144969 read calls are required when OC_HUFF_SLUSH is 1 (34.6%).
-  10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
-  10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
-  Since a value of 1 gets us the vast majority of the speed-up with only a
-   small amount of wasted memory, this is what we use.*/
-#define OC_HUFF_SLUSH (1)
+  19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%).
+  11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%).
+  10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%).
+  10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%).
+  Since a value of 2 gets us the vast majority of the speed-up with only a
+   small amount of wasted memory, this is what we use.
+  This value must be less than 128, or you could create a tree with more than
+   32767 entries, which would overflow the 16-bit words used to index it.*/
+#define OC_HUFF_SLUSH (2)
+/*The root of the tree is on the fast path, and a larger value here is more
+   beneficial than elsewhere in the tree.
+  7 appears to give the best performance, trading off between increased use of
+   the single-read fast path and cache footprint for the tables, though
+   obviously this will depend on your cache size.
+  Using 7 here, the VP3 tables are about twice as large compared to using 2.*/
+#define OC_ROOT_HUFF_SLUSH (7)


-/*Determines the size in bytes of a Huffman tree node that represents a
+
+/*Unpacks a Huffman codebook.
+  The codebook is read as a depth-first walk of a binary tree, zero branch
+   first: a 0 bit descends into an internal node, and a 1 bit marks a leaf,
+   which is followed by an OC_NDCT_TOKEN_BITS-bit spec token.
+  Each spec token is expanded into 1<<neb consecutive internal tokens, where
+   neb is the number of its extra bits that are folded into the token value;
+   the codeword length recorded for each of them is lengthened by neb to
+   match.
+  _opb:    The buffer to unpack from.
+  _tokens: Stores a list of internal tokens, in the order they were found in
+            the codebook, and the lengths of their corresponding codewords.
+           This is enough to completely define the codebook, while minimizing
+            stack usage and avoiding temporary allocations (for platforms
+            where free() is a no-op).
+  Return: The number of internal tokens in the codebook, or a negative value
+   on error.
+          TH_EBADHEADER is returned if the buffer runs out while reading a
+           node, if a codeword would be longer than 32 bits, or if the
+           codebook contains more than 32 leaves.*/
+int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){
+  ogg_uint32_t code;
+  int          len;
+  int          ntokens;
+  int          nleaves;
+  /*code holds the path to the current node, left-aligned in 32 bits, and len
+     is the depth of that node.*/
+  code=0;
+  len=ntokens=nleaves=0;
+  for(;;){
+    long bits;
+    bits=oc_pack_read1(_opb);
+    /*Only process nodes so long as there are more bits in the buffer.*/
+    if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+    /*Read an internal node:*/
+    if(!bits){
+      len++;
+      /*Don't allow codewords longer than 32 bits.*/
+      if(len>32)return TH_EBADHEADER;
+    }
+    /*Read a leaf node:*/
+    else{
+      ogg_uint32_t code_bit;
+      int          neb;
+      int          nentries;
+      int          token;
+      /*Don't allow more than 32 spec-tokens per codebook.
+        Each one expands to at most 8 internal tokens, so ntokens can never
+         exceed the 256 entries available in _tokens.*/
+      if(++nleaves>32)return TH_EBADHEADER;
+      bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
+      neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits];
+      token=OC_DCT_TOKEN_MAP[bits];
+      nentries=1<<neb;
+      while(nentries-->0){
+        _tokens[ntokens][0]=(unsigned char)token++;
+        _tokens[ntokens][1]=(unsigned char)(len+neb);
+        ntokens++;
+      }
+      /*A codebook whose root is a leaf has a single zero-length codeword.
+        There is no other node to advance to, and computing code_bit below
+         would shift by a negative amount, which is undefined behavior.*/
+      if(len<=0)break;
+      /*Advance to the next unvisited node: walk back up the tree past every
+         one branch we have just finished, then take the one branch of the
+         first node whose zero branch we came from.
+        If we walk all the way back up to the root, the tree is complete.*/
+      code_bit=0x80000000U>>len-1;
+      while(len>0&&(code&code_bit)){
+        code^=code_bit;
+        code_bit<<=1;
+        len--;
+      }
+      if(len<=0)break;
+      code|=code_bit;
+    }
+  }
+  return ntokens;
+}
+
+/*Counts the tokens that make up one complete subtree rooted at depth _depth.
+  A codeword that ends l levels below the root of the subtree covers 2**-l of
+   it; tokens are consumed from the front of the list until the whole subtree
+   has been covered.
+  _tokens: A list of internal tokens, in the order they are found in the
+            codebook, and the lengths of their corresponding codewords.
+           The first entry must be the first token of the subtree.
+  _depth:  The depth of the root of the subtree in the corresponding tree
+            structure.
+  Return: The number of tokens that belong to that subtree.*/
+static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){
+  ogg_uint32_t filled;
+  int          ntokens;
+  /*filled tracks the covered fraction of the subtree in units of 2**-31.*/
+  filled=0;
+  ntokens=0;
+  for(;;){
+    int rel_len;
+    rel_len=_tokens[ntokens][1]-_depth;
+    if(rel_len>=32){
+      /*Because of the expanded internal tokens, we can have codewords as long
+         as 35 bits, too deep to account for individually.
+        Count the whole subtree 31 levels down as a single unit instead; one
+         level of recursion is enough to advance past all of its tokens.*/
+      filled++;
+      ntokens+=oc_huff_subtree_tokens(_tokens+ntokens,_depth+31);
+    }
+    else{
+      filled+=0x80000000U>>rel_len;
+      ntokens++;
+    }
+    if(filled>=0x80000000U)return ntokens;
+  }
+}
+
+/*Compute the number of bits to use for a collapsed tree node at the given
+   depth.
+  Candidate widths are tried in increasing order for as long as widening the
+   table adds distinct entries and at least 1/slush of its 1<<nbits slots
+   would hold distinct entries, where slush is OC_ROOT_HUFF_SLUSH at depth 0
+   and OC_HUFF_SLUSH everywhere else.
+  Of the widths accepted this way, the largest one at which some codeword
+   ends exactly is returned; a width of one is always treated as accepted.
+  _tokens:  A list of internal tokens, in the order they are found in the
+             codebook, and the lengths of their corresponding codewords.
+  _ntokens: The number of tokens corresponding to this tree node.
+  _depth:   The depth of this tree node.
+  Return: The number of bits to use for a collapsed tree node rooted here.
+          This is always at least one, even if this was a leaf node.*/
+static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2],
+ int _ntokens,int _depth){
+  int got_leaves;
+  int loccupancy;
+  int occupancy;
+  int slush;
+  int nbits;
+  int best_nbits;
+  slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH;
+  /*It's legal to have a tree with just a single node, which requires no bits
+     to decode and always returns the same token.
+    However, no encoder actually does this (yet).
+    To avoid a special case in oc_huff_token_decode(), we force the number of
+     lookahead bits to be at least one.
+    This will produce a tree that looks ahead one bit and then advances the
+     stream zero bits.
+    Starting with got_leaves set makes the first pass through the loop below
+     record a width of one in best_nbits before any wider table is tried.
+    On each pass, occupancy counts the distinct children nbits levels down:
+     every codeword that ends at or above that level counts once, and so does
+     every subtree that continues below it.*/
+  nbits=1;
+  occupancy=2;
+  got_leaves=1;
+  do{
+    int ti;
+    if(got_leaves)best_nbits=nbits;
+    nbits++;
+    got_leaves=0;
+    loccupancy=occupancy;
+    for(occupancy=ti=0;ti<_ntokens;occupancy++){
+      if(_tokens[ti][1]<_depth+nbits)ti++;
+      else if(_tokens[ti][1]==_depth+nbits){
+        got_leaves=1;
+        ti++;
+      }
+      else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits);
+    }
+  }
+  while(occupancy>loccupancy&&occupancy*slush>=1<<nbits);
+  return best_nbits;
+}
+
+/*Determines the size in words of a Huffman tree node that represents a
    subtree of depth _nbits.
   _nbits: The depth of the subtree.
-          If this is 0, the node is a leaf node.
-          Otherwise 1<<_nbits pointers are allocated for children.
-  Return: The number of bytes required to store the node.*/
+          This must be greater than zero.
+  Return: The number of words required to store the node: one word for the
+           bit count itself, followed by 1<<_nbits child entries.*/
 static size_t oc_huff_node_size(int _nbits){
-  size_t size;
-  size=_ogg_offsetof(oc_huff_node,nodes);
-  if(_nbits>0)size+=sizeof(oc_huff_node *)*(1<<_nbits);
-  return size;
+  return 1+(1<<_nbits);
 }

-static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){
-  oc_huff_node *ret;
-  ret=(oc_huff_node *)*_storage;
-  ret->nbits=(unsigned char)_nbits;
-  (*_storage)+=_size;
-  return ret;
-}
-
-
-/*Determines the size in bytes of a Huffman tree.
-  _nbits: The depth of the subtree.
-          If this is 0, the node is a leaf node.
-          Otherwise storage for 1<<_nbits pointers are added for children.
-  Return: The number of bytes required to store the tree.*/
-static size_t oc_huff_tree_size(const oc_huff_node *_node){
-  size_t size;
-  size=oc_huff_node_size(_node->nbits);
-  if(_node->nbits){
-    int nchildren;
-    int i;
-    nchildren=1<<_node->nbits;
-    for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){
-      size+=oc_huff_tree_size(_node->nodes[i]);
-    }
-  }
-  return size;
-}
-
-
-/*Unpacks a sub-tree from the given buffer.
-  _opb:      The buffer to unpack from.
-  _binodes:  The nodes to store the sub-tree in.
-  _nbinodes: The number of nodes available for the sub-tree.
-  Return: 0 on success, or a negative value on error.*/
-static int oc_huff_tree_unpack(oc_pack_buf *_opb,
- oc_huff_node *_binodes,int _nbinodes){
-  oc_huff_node *binode;
-  long          bits;
-  int           nused;
-  if(_nbinodes<1)return TH_EBADHEADER;
-  binode=_binodes;
-  nused=0;
-  bits=oc_pack_read1(_opb);
-  if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
-  /*Read an internal node:*/
-  if(!bits){
-    int ret;
-    nused++;
-    binode->nbits=1;
-    binode->depth=1;
-    binode->nodes[0]=_binodes+nused;
-    ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
-    if(ret>=0){
-      nused+=ret;
-      binode->nodes[1]=_binodes+nused;
-      ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
-    }
-    if(ret<0)return ret;
-    nused+=ret;
-  }
-  /*Read a leaf node:*/
-  else{
-    int ntokens;
-    int token;
-    int i;
-    bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
-    if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
-    /*Find out how many internal tokens we translate this external token into.*/
-    ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
-    if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
-    /*Fill in a complete binary tree pointing to the internal tokens.*/
-    for(i=1;i<ntokens;i<<=1){
-      int j;
-      binode=_binodes+nused;
-      nused+=i;
-      for(j=0;j<i;j++){
-        binode[j].nbits=1;
-        binode[j].depth=1;
-        binode[j].nodes[0]=_binodes+nused+2*j;
-        binode[j].nodes[1]=_binodes+nused+2*j+1;
+/*Produces a collapsed-tree representation of the given token list.
+  _tree: The storage for the collapsed Huffman tree.
+         This may be NULL to compute the required storage size instead of
+          constructing the tree.
+  _tokens:  A list of internal tokens, in the order they are found in the
+             codebook, and the lengths of their corresponding codewords.
+  _ntokens: The number of tokens corresponding to this tree node.
+  Return: The number of words required to store the tree.*/
+static size_t oc_huff_tree_collapse(ogg_int16_t *_tree,
+ unsigned char _tokens[][2],int _ntokens){
+  ogg_int16_t   node[34]; /*Per level: next slot to write in _tree.*/
+  unsigned char depth[34]; /*Per level: depth of that node's root.*/
+  unsigned char last[34]; /*Per level: index of the node's final token.*/
+  size_t        ntree; /*Words assigned to nodes so far.*/
+  int           ti;
+  int           l;
+  depth[0]=0;
+  last[0]=(unsigned char)(_ntokens-1);
+  ntree=0;
+  ti=0;
+  l=0;
+  do{
+    int nbits;
+    nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]);
+    node[l]=(ogg_int16_t)ntree;
+    ntree+=oc_huff_node_size(nbits);
+    if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits; /*Header word.*/
+    do{
+      while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){
+        if(_tree!=NULL){
+          ogg_int16_t leaf; /*-(bits used below this node<<8|token).*/
+          int         nentries; /*Table slots this codeword covers.*/
+          nentries=1<<depth[l]+nbits-_tokens[ti][1];
+          leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]);
+          while(nentries-->0)_tree[node[l]++]=leaf;
+        }
+        ti++;
      }
+      if(ti<=last[l]){
+        /*We need to recurse: the slot links to where the child node will start.*/
+        depth[l+1]=(unsigned char)(depth[l]+nbits);
+        if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree;
+        l++;
+        last[l]= /*Presumably the helper returns the subtree's token count.*/
+         (unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1);
+        break;
+      }
+      /*Pop back up a level of recursion.*/
+      else if(l-->0)nbits=depth[l+1]-depth[l];
    }
-    /*And now the leaf nodes with those tokens.*/
-    token=OC_DCT_TOKEN_MAP[bits];
-    for(i=0;i<ntokens;i++){
-      binode=_binodes+nused++;
-      binode->nbits=0;
-      binode->depth=1;
-      binode->token=token+i;
-    }
+    while(l>=0);
  }
-  return nused;
-}
-
-/*Finds the depth of shortest branch of the given sub-tree.
-  The tree must be binary.
-  _binode: The root of the given sub-tree.
-           _binode->nbits must be 0 or 1.
-  Return: The smallest depth of a leaf node in this sub-tree.
-          0 indicates this sub-tree is a leaf node.*/
-static int oc_huff_tree_mindepth(oc_huff_node *_binode){
-  int depth0;
-  int depth1;
-  if(_binode->nbits==0)return 0;
-  depth0=oc_huff_tree_mindepth(_binode->nodes[0]);
-  depth1=oc_huff_tree_mindepth(_binode->nodes[1]);
-  return OC_MINI(depth0,depth1)+1;
-}
-
-/*Finds the number of internal nodes at a given depth, plus the number of
-   leaves at that depth or shallower.
-  The tree must be binary.
-  _binode: The root of the given sub-tree.
-           _binode->nbits must be 0 or 1.
-  Return: The number of entries that would be contained in a jump table of the
-           given depth.*/
-static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){
-  if(_binode->nbits==0||_depth<=0)return 1;
-  else{
-    return oc_huff_tree_occupancy(_binode->nodes[0],_depth-1)+
-     oc_huff_tree_occupancy(_binode->nodes[1],_depth-1);
-  }
-}
-
-/*Makes a copy of the given Huffman tree.
-  _node: The Huffman tree to copy.
-  Return: The copy of the Huffman tree.*/
-static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node,
- char **_storage){
-  oc_huff_node *ret;
-  ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits);
-  ret->depth=_node->depth;
-  if(_node->nbits){
-    int nchildren;
-    int i;
-    int inext;
-    nchildren=1<<_node->nbits;
-    for(i=0;i<nchildren;){
-      ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage);
-      inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
-      while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
-    }
-  }
-  else ret->token=_node->token;
-  return ret;
-}
-
-static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
-  size_t size;
-  int    mindepth;
-  int    depth;
-  int    loccupancy;
-  int    occupancy;
-  if(_binode->nbits!=0&&_depth>0){
-    return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+
-     oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
-  }
-  depth=mindepth=oc_huff_tree_mindepth(_binode);
-  occupancy=1<<mindepth;
-  do{
-    loccupancy=occupancy;
-    occupancy=oc_huff_tree_occupancy(_binode,++depth);
-  }
-  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
-  depth--;
-  size=oc_huff_node_size(depth);
-  if(depth>0){
-    size+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1);
-    size+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
-  }
-  return size;
-}
-
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage);
-
-/*Fills the given nodes table with all the children in the sub-tree at the
-   given depth.
-  The nodes in the sub-tree with a depth less than that stored in the table
-   are freed.
-  The sub-tree must be binary and complete up until the given depth.
-  _nodes:  The nodes table to fill.
-  _binode: The root of the sub-tree to fill it with.
-           _binode->nbits must be 0 or 1.
-  _level:  The current level in the table.
-           0 indicates that the current node should be stored, regardless of
-            whether it is a leaf node or an internal node.
-  _depth:  The depth of the nodes to fill the table with, relative to their
-            parent.*/
-static void oc_huff_node_fill(oc_huff_node **_nodes,
- oc_huff_node *_binode,int _level,int _depth,char **_storage){
-  if(_level<=0||_binode->nbits==0){
-    int i;
-    _binode->depth=(unsigned char)(_depth-_level);
-    _nodes[0]=oc_huff_tree_collapse(_binode,_storage);
-    for(i=1;i<1<<_level;i++)_nodes[i]=_nodes[0];
-  }
-  else{
-    _level--;
-    oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth,_storage);
-    _nodes+=1<<_level;
-    oc_huff_node_fill(_nodes,_binode->nodes[1],_level,_depth,_storage);
-  }
-}
-
-/*Finds the largest complete sub-tree rooted at the current node and collapses
-   it into a single node.
-  This procedure is then applied recursively to all the children of that node.
-  _binode: The root of the sub-tree to collapse.
-           _binode->nbits must be 0 or 1.
-  Return: The new root of the collapsed sub-tree.*/
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage){
-  oc_huff_node *root;
-  size_t        size;
-  int           mindepth;
-  int           depth;
-  int           loccupancy;
-  int           occupancy;
-  depth=mindepth=oc_huff_tree_mindepth(_binode);
-  occupancy=1<<mindepth;
-  do{
-    loccupancy=occupancy;
-    occupancy=oc_huff_tree_occupancy(_binode,++depth);
-  }
-  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
-  depth--;
-  if(depth<=0)return oc_huff_tree_copy(_binode,_storage);
-  size=oc_huff_node_size(depth);
-  root=oc_huff_node_init(_storage,size,depth);
-  root->depth=_binode->depth;
-  oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
-  return root;
+  while(l>=0);
+  return ntree;
 }

 /*Unpacks a set of Huffman trees, and reduces them to a collapsed
   representation.
  _opb:   The buffer to unpack the trees from.
  _nodes: The table to fill with the Huffman trees.
-  Return: 0 on success, or a negative value on error.*/
+  Return: 0 on success, or a negative value on error.
+          The caller is responsible for cleaning up any partially initialized
+           _nodes on failure.*/
 int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
  int i;
  for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    oc_huff_node  nodes[511];
-    char         *storage;
-    size_t        size;
-    int           ret;
+    unsigned char  tokens[256][2];
+    int            ntokens;
+    ogg_int16_t   *tree;
+    size_t         size;
    /*Unpack the full tree into a temporary buffer.*/
-    ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes));
-    if(ret<0)return ret;
-    /*Figure out how big the collapsed tree will be.*/
-    size=oc_huff_tree_collapse_size(nodes,0);
-    storage=(char *)_ogg_calloc(1,size);
-    if(storage==NULL)return TH_EFAULT;
-    /*And collapse it.*/
-    _nodes[i]=oc_huff_tree_collapse(nodes,&storage);
+    ntokens=oc_huff_tree_unpack(_opb,tokens);
+    if(ntokens<0)return ntokens;
+    /*Figure out how big the collapsed tree will be and allocate space for it.*/
+    size=oc_huff_tree_collapse(NULL,tokens,ntokens); /*NULL: size-only pass.*/
+    /*This should never happen; if it does it means you set OC_HUFF_SLUSH or
+       OC_ROOT_HUFF_SLUSH too large.*/
+    if(size>32767)return TH_EIMPL; /*Child links are positive ogg_int16_t.*/
+    tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
+    if(tree==NULL)return TH_EFAULT;
+    /*Construct the collapsed tree.*/
+    oc_huff_tree_collapse(tree,tokens,ntokens);
+    _nodes[i]=tree;
  }
  return 0;
 }

+/*Computes how many words a subtree of a collapsed Huffman tree occupies.
+  _tree: The full collapsed tree that contains the subtree.
+  _node: The index in _tree at which the subtree's root node starts.
+  Return: The word count of the root node plus every node linked below it.*/
+static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){
+  size_t total;
+  int    nbits;
+  int    nentries;
+  int    ei;
+  nbits=_tree[_node];
+  total=oc_huff_node_size(nbits);
+  nentries=1<<nbits;
+  ei=0;
+  do{
+    int entry;
+    entry=_tree[_node+1+ei];
+    if(entry>0){ /*Positive entries are links to child nodes.*/
+      total+=oc_huff_tree_size(_tree,entry);
+      ei++;
+    }
+    else ei+=1<<nbits-(-entry>>8); /*Leaf: step over every slot it fills.*/
+  }
+  while(ei<nentries);
+  return total;
+}
+
 /*Makes a copy of the given set of Huffman trees.
  _dst: The array to store the copy in.
  _src: The array of trees to copy.*/
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
+  int total; /*NOTE(review): accumulated below but never read here.*/
  int i;
+  total=0;
  for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    size_t  size;
-    char   *storage;
-    size=oc_huff_tree_size(_src[i]);
-    storage=(char *)_ogg_calloc(1,size);
-    if(storage==NULL){
+    size_t size;
+    size=oc_huff_tree_size(_src[i],0); /*Words in the whole tree, from index 0.*/
+    total+=size;
+    _dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i]));
+    if(_dst[i]==NULL){ /*On failure the copies made so far are released.*/
      while(i-->0)_ogg_free(_dst[i]);
      return TH_EFAULT;
    }
-    _dst[i]=oc_huff_tree_copy(_src[i],&storage);
+    memcpy(_dst[i],_src[i],size*sizeof(*_dst[i])); /*Links are indices, so a flat copy works.*/
  }
  return 0;
 }

 /*Frees the memory used by a set of Huffman trees.
  _nodes: The array of trees to free.*/
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){ /*Entries are freed, not reset.*/
  int i;
  for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
 }

+
 /*Unpacks a single token using the given Huffman tree.
  _opb:  The buffer to unpack the token from.
  _node: The tree to unpack the token with.
  Return: The token value.*/
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){
-  long bits;
-  while(_node->nbits!=0){
-    bits=oc_pack_look(_opb,_node->nbits);
-    _node=_node->nodes[bits];
-    oc_pack_adv(_opb,_node->depth);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+  const unsigned char *ptr;
+  const unsigned char *stop;
+  oc_pb_window         window;
+  int                  available;
+  long                 bits;
+  int                  node; /*Index into _tree, or a leaf entry on exit.*/
+  int                  n; /*Bits looked up or consumed at this step.*/
+  ptr=_opb->ptr; /*Work on local copies; written back at the end.*/
+  window=_opb->window;
+  stop=_opb->stop;
+  available=_opb->bits;
+  node=0;
+  for(;;){
+    n=_tree[node]; /*Header word: lookup width of this node.*/
+    if(n>available){ /*Refill the window one byte at a time.*/
+      unsigned shift;
+      shift=OC_PB_WINDOW_SIZE-available;
+      do{
+        /*We don't bother setting eof because we won't check for it after we've
+           started decoding DCT tokens.*/
+        if(ptr>=stop){
+          shift=(unsigned)-OC_LOTS_OF_BITS; /*Makes available large below.*/
+          break;
+        }
+        shift-=8;
+        window|=(oc_pb_window)*ptr++<<shift;
+      }
+      while(shift>=8);
+      /*Note: We never request more than 24 bits, so there's no need to fill in
+         the last partial byte here.*/
+      available=OC_PB_WINDOW_SIZE-shift;
+    }
+    bits=window>>OC_PB_WINDOW_SIZE-n; /*Top n bits select the table entry.*/
+    node=_tree[node+1+bits];
+    if(node<=0)break; /*Non-positive entries are leaves.*/
+    window<<=n;
+    available-=n;
  }
-  return _node->token;
+  node=-node; /*Leaf entry is -(bits used<<8|token).*/
+  n=node>>8; /*Only the bits of the matched codeword are consumed.*/
+  window<<=n;
+  available-=n;
+  _opb->ptr=ptr;
+  _opb->window=window;
+  _opb->bits=available;
+  return node&255;
 }
--- a/media/libtheora/lib/huffdec.h
+++ b/media/libtheora/lib/huffdec.h
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: huffdec.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: huffdec.h 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

@ -22,71 +22,11 @@



-typedef struct oc_huff_node oc_huff_node;
-
-/*A node in the Huffman tree.
-  Instead of storing every branching in the tree, subtrees can be collapsed
-   into one node, with a table of size 1<<nbits pointing directly to its
-   descedents nbits levels down.
-  This allows more than one bit to be read at a time, and avoids following all
-   the intermediate branches with next to no increased code complexity once
-   the collapsed tree has been built.
-  We do _not_ require that a subtree be complete to be collapsed, but instead
-   store duplicate pointers in the table, and record the actual depth of the
-   node below its parent.
-  This tells us the number of bits to advance the stream after reaching it.
-
-  This turns out to be equivalent to the method described in \cite{Hash95},
-   without the requirement that codewords be sorted by length.
-  If the codewords were sorted by length (so-called ``canonical-codes''), they
-   could be decoded much faster via either Lindell and Moffat's approach or
-   Hashemian's Condensed Huffman Code approach, the latter of which has an
-   extremely small memory footprint.
-  We can't use Choueka et al.'s finite state machine approach, which is
-   extremely fast, because we can't allow multiple symbols to be output at a
-   time; the codebook can and does change between symbols.
-  It also has very large memory requirements, which impairs cache coherency.
-
-  @ARTICLE{Hash95,
-    author="Reza Hashemian",
-    title="Memory Efficient and High-Speed Search {Huffman} Coding",
-    journal="{IEEE} Transactions on Communications",
-    volume=43,
-    number=10,
-    pages="2576--2581",
-    month=Oct,
-    year=1995
-  }*/
-struct oc_huff_node{
-  /*The number of bits of the code needed to descend through this node.
-    0 indicates a leaf node.
-    Otherwise there are 1<<nbits nodes in the nodes table, which can be
-     indexed by reading nbits bits from the stream.*/
-  unsigned char  nbits;
-  /*The value of a token stored in a leaf node.
-    The value in non-leaf nodes is undefined.*/
-  unsigned char  token;
-  /*The depth of the current node, relative to its parent in the collapsed
-     tree.
-    This can be less than its parent's nbits value, in which case there are
-     1<<nbits-depth copies of this node in the table, and the bitstream should
-     only be advanced depth bits after reaching this node.*/
-  unsigned char  depth;
-  /*The table of child nodes.
-    The ACTUAL size of this array is 1<<nbits, despite what the declaration
-     below claims.
-    The exception is that for leaf nodes the size is 0.*/
-  oc_huff_node  *nodes[2];
-};
-
-
-
 int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node);
-
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);

 #endif
--- a/media/libtheora/lib/huffenc.h
+++ b/media/libtheora/lib/huffenc.h
@ -1,19 +0,0 @@
-#if !defined(_huffenc_H)
-# define _huffenc_H (1)
-# include "huffman.h"
-
-
-
-typedef th_huff_code                  th_huff_table[TH_NDCT_TOKENS];
-
-
-
-extern const th_huff_code
- TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
-
-
-
-int oc_huff_codes_pack(oggpack_buffer *_opb,
- const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
-
-#endif
--- a/media/libtheora/lib/idct.c
+++ b/media/libtheora/lib/idct.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: idct.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

@ -231,18 +231,18 @@ static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
  _y: The buffer to store the result in.
      This may be the same as _x.
  _x: The input coefficients.*/
-static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
  /*Transform rows of x into columns of w.*/
  idct8_2(w,_x);
  idct8_1(w+1,_x+8);
  /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
+  for(i=0;i<8;i++)idct8_2(_y+i,w+i*8); /*Row i of w feeds column i of _y.*/
  /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  /*Clear input data for next block (decoder only); skipped when in place.*/
+  if(_x!=_y)_x[0]=_x[1]=_x[8]=0; /*This write is why _x is no longer const.*/
 }

 /*Performs an inverse 8x8 Type-II DCT transform.
@ -260,20 +260,20 @@ static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  _y: The buffer to store the result in.
      This may be the same as _x.
  _x: The input coefficients.*/
-static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
  /*Transform rows of x into columns of w.*/
  idct8_4(w,_x);
  idct8_3(w+1,_x+8);
  idct8_2(w+2,_x+16);
  idct8_1(w+3,_x+24);
  /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
+  for(i=0;i<8;i++)idct8_4(_y+i,w+i*8); /*Row i of w feeds column i of _y.*/
  /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  /*Clear input data for next block (decoder only); skipped when in place.*/
+  if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
 }

 /*Performs an inverse 8x8 Type-II DCT transform.
@ -282,28 +282,22 @@ static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  _y: The buffer to store the result in.
      This may be the same as _x.
  _x: The input coefficients.*/
-static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
  /*Transform rows of x into columns of w.*/
-  for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  for(i=0;i<8;i++)idct8(w+i,_x+i*8); /*Row i of _x feeds column i of w.*/
  /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  for(i=0;i<8;i++)idct8(_y+i,w+i*8); /*Row i of w feeds column i of _y.*/
  /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
-}
-
-void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- int _last_zzi){
-  (*_state->opt_vtable.idct8x8)(_y,_last_zzi);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  if(_x!=_y)for(i=0;i<64;i++)_x[i]=0; /*Clear input data for next block (decoder only).*/
 }

 /*Performs an inverse 8x8 Type-II DCT transform.
  The input is assumed to be scaled by a factor of 4 relative to orthonormal
   version of the transform.*/
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
  /*_last_zzi is subtly different from an actual count of the number of
     coefficients we decoded for this block.
    It contains the value of zzi BEFORE the final token in the block was
@ -329,7 +323,7 @@ void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
     gets.
    Needless to say we inherited this approach from VP3.*/
  /*Then perform the iDCT.*/
-  if(_last_zzi<3)oc_idct8x8_3(_y,_y);
-  else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
-  else oc_idct8x8_slow(_y,_y);
+  if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
+  else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+  else oc_idct8x8_slow(_y,_x);
 }
--- a/media/libtheora/lib/internal.c
+++ b/media/libtheora/lib/internal.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: internal.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: internal.c 17506 2010-10-13 02:52:41Z tterribe $

 ********************************************************************/

@ -97,79 +97,29 @@ int oc_ilog(unsigned _v){



-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the X and Y directions
-   (4:2:0).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0];
-  dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2);
+void *oc_aligned_malloc(size_t _sz,size_t _align){ /*NULL on bad _align/OOM.*/
+  unsigned char *p;
+  if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
+  p=(unsigned char *)_ogg_malloc(_sz+_align); /*Room for 1.._align pad bytes.*/
+  if(p!=NULL){
+    int offs; /*Pad bytes before the tag byte; fits in an unsigned char.*/
+    offs=(int)(~(size_t)(p-(unsigned char *)0)&_align-1); /*(-addr-1) mod _align, so p+offs+1 is aligned.*/
+    p[offs]=offs; /*Tag byte read back by oc_aligned_free().*/
+    p+=offs+1;
+  }
+  return p;
 }

-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the Y direction.
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[2][0];
-  dy=_lbmvs[0][1]+_lbmvs[2][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-  dx=_lbmvs[1][0]+_lbmvs[3][0];
-  dy=_lbmvs[1][1]+_lbmvs[3][1];
-  _cbmvs[1][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[1][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+/*Releases a pointer returned by oc_aligned_malloc(); NULL is a no-op.*/
+void oc_aligned_free(void *_ptr){
+  unsigned char *tag;
+  if(_ptr==NULL)return;
+  /*The byte just below _ptr holds the count of padding bytes skipped.*/
+  tag=(unsigned char *)_ptr-1;
+  /*Step back over the padding to the address the allocator returned.*/
+  _ogg_free(tag-*tag);
 }

-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the X direction (4:2:2).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[1][0];
-  dy=_lbmvs[0][1]+_lbmvs[1][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-  dx=_lbmvs[2][0]+_lbmvs[3][0];
-  dy=_lbmvs[2][1]+_lbmvs[3][1];
-  _cbmvs[2][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[2][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-}
-
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with no chroma decimation (4:4:4).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lmbmv: The luma macro-block level motion vector to fill in for use in
-           prediction.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0]));
-}
-
-/*A table of functions used to fill in the chroma plane motion vectors for a
-   macro block when 4 different motion vectors are specified in the luma
-   plane.*/
-const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs11
-};
-
-

 void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
  size_t  rowsz;
--- a/media/libtheora/lib/internal.h
+++ b/media/libtheora/lib/internal.h
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: internal.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: internal.h 17578 2010-10-29 04:21:26Z tterribe $

 ********************************************************************/
 #if !defined(_internal_H)
@ -19,10 +19,20 @@
 # include <stdlib.h>
 # include <limits.h>
 # if defined(HAVE_CONFIG_H)
-#  include <config.h>
+#  include "config.h"
 # endif
 # include "theora/codec.h"
 # include "theora/theora.h"
+# include "ocintrin.h"
+
+# if !defined(__GNUC_PREREQ)
+#  if defined(__GNUC__)&&defined(__GNUC_MINOR__)
+#   define __GNUC_PREREQ(_maj,_min) \
+ ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
+#  else
+#   define __GNUC_PREREQ(_maj,_min) 0
+#  endif
+# endif

 # if defined(_MSC_VER)
 /*Disable missing EMMS warnings.*/
@ -31,24 +41,25 @@
 #  pragma warning(disable:4554)
 # endif
 /*You, too, gcc.*/
-# if defined(__GNUC_PREREQ)
-#  if __GNUC_PREREQ(4,2)
-#   pragma GCC diagnostic ignored "-Wparentheses"
-#  endif
+# if __GNUC_PREREQ(4,2)
+#  pragma GCC diagnostic ignored "-Wparentheses"
 # endif

-# include "ocintrin.h"
-# include "huffman.h"
-# include "quant.h"
-
-/*Some assembly constructs require aligned operands.*/
-# if defined(OC_X86_ASM)
+/*Some assembly constructs require aligned operands.
+  The following macros are _only_ intended for structure member declarations.
+  Although they will sometimes work on stack variables, gcc will often silently
+   ignore them.
+  A separate set of macros could be made for manual stack alignment, but we
+   don't actually require it anywhere.*/
+# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
 #  if defined(__GNUC__)
 #   define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
 #   define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
 #  elif defined(_MSC_VER)
 #   define OC_ALIGN8(expr) __declspec (align(8)) expr
 #   define OC_ALIGN16(expr) __declspec (align(16)) expr
+#  else
+#   error "Alignment macros required for this platform."
 #  endif
 # endif
 # if !defined(OC_ALIGN8)
@ -60,19 +71,8 @@



-typedef struct oc_sb_flags              oc_sb_flags;
-typedef struct oc_border_info           oc_border_info;
-typedef struct oc_fragment              oc_fragment;
-typedef struct oc_fragment_plane        oc_fragment_plane;
-typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
-typedef struct oc_base_opt_data         oc_base_opt_data;
-typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
-typedef struct oc_theora_state          oc_theora_state;
-
-
-
 /*This library's version.*/
-# define OC_VENDOR_STRING "Xiph.Org libtheora 1.1 20090822 (Thusnelda)"
+# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)"

 /*Theora bitstream version.*/
 # define TH_VERSION_MAJOR (3)
@ -83,315 +83,6 @@ typedef struct oc_theora_state          oc_theora_state;
 ((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
 (_info)->version_subminor>=(_sub)))

-/*A keyframe.*/
-#define OC_INTRA_FRAME (0)
-/*A predicted frame.*/
-#define OC_INTER_FRAME (1)
-/*A frame of unknown type (frame type decision has not yet been made).*/
-#define OC_UNKWN_FRAME (-1)
-
-/*The amount of padding to add to the reconstructed frame buffers on all
-   sides.
-  This is used to allow unrestricted motion vectors without special casing.
-  This must be a multiple of 2.*/
-#define OC_UMV_PADDING (16)
-
-/*Frame classification indices.*/
-/*The previous golden frame.*/
-#define OC_FRAME_GOLD (0)
-/*The previous frame.*/
-#define OC_FRAME_PREV (1)
-/*The current frame.*/
-#define OC_FRAME_SELF (2)
-
-/*The input or output buffer.*/
-#define OC_FRAME_IO   (3)
-
-/*Macroblock modes.*/
-/*Macro block is invalid: It is never coded.*/
-#define OC_MODE_INVALID        (-1)
-/*Encoded difference from the same macro block in the previous frame.*/
-#define OC_MODE_INTER_NOMV     (0)
-/*Encoded with no motion compensated prediction.*/
-#define OC_MODE_INTRA          (1)
-/*Encoded difference from the previous frame offset by the given motion 
-  vector.*/
-#define OC_MODE_INTER_MV       (2)
-/*Encoded difference from the previous frame offset by the last coded motion 
-  vector.*/
-#define OC_MODE_INTER_MV_LAST  (3)
-/*Encoded difference from the previous frame offset by the second to last 
-  coded motion vector.*/
-#define OC_MODE_INTER_MV_LAST2 (4)
-/*Encoded difference from the same macro block in the previous golden 
-  frame.*/
-#define OC_MODE_GOLDEN_NOMV    (5)
-/*Encoded difference from the previous golden frame offset by the given motion 
-  vector.*/
-#define OC_MODE_GOLDEN_MV      (6)
-/*Encoded difference from the previous frame offset by the individual motion 
-  vectors given for each block.*/
-#define OC_MODE_INTER_MV_FOUR  (7)
-/*The number of (coded) modes.*/
-#define OC_NMODES              (8)
-
-/*Determines the reference frame used for a given MB mode.*/
-#define OC_FRAME_FOR_MODE(_x) \
- OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
-  OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
-
-/*Constants for the packet state machine common between encoder and decoder.*/
-
-/*Next packet to emit/read: Codec info header.*/
-#define OC_PACKET_INFO_HDR    (-3)
-/*Next packet to emit/read: Comment header.*/
-#define OC_PACKET_COMMENT_HDR (-2)
-/*Next packet to emit/read: Codec setup header.*/
-#define OC_PACKET_SETUP_HDR   (-1)
-/*No more packets to emit/read.*/
-#define OC_PACKET_DONE        (INT_MAX)
-
-
-
-/*Super blocks are 32x32 segments of pixels in a single color plane indexed
-   in image order.
-  Internally, super blocks are broken up into four quadrants, each of which
-   contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
-  Quadrants, and the blocks within them, are indexed in a special order called
-   a "Hilbert curve" within the super block.
-
-  In order to differentiate between the Hilbert-curve indexing strategy and
-   the regular image order indexing strategy, blocks indexed in image order
-   are called "fragments".
-  Fragments are indexed in image order, left to right, then bottom to top,
-   from Y' plane to Cb plane to Cr plane.
-
-  The co-located fragments in all image planes corresponding to the location
-   of a single quadrant of a luma plane super block form a macro block.
-  Thus there is only a single set of macro blocks for all planes, each of which
-   contains between 6 and 12 fragments, depending on the pixel format.
-  Therefore macro block information is kept in a separate set of arrays from
-   super blocks to avoid unused space in the other planes.
-  The lists are indexed in super block order.
-  That is, the macro block corresponding to the macro block mbi in (luma plane)
-   super block sbi is at index (sbi<<2|mbi).
-  Thus the number of macro blocks in each dimension is always twice the number
-   of super blocks, even when only an odd number fall inside the coded frame.
-  These "extra" macro blocks are just an artifact of our internal data layout,
-   and not part of the coded stream; they are flagged with a negative MB mode.*/
-
-
-
-/*A single quadrant of the map from a super block to fragment numbers.*/
-typedef ptrdiff_t       oc_sb_map_quad[4];
-/*A map from a super block to fragment numbers.*/
-typedef oc_sb_map_quad  oc_sb_map[4];
-/*A single plane of the map from a macro block to fragment numbers.*/
-typedef ptrdiff_t       oc_mb_map_plane[4];
-/*A map from a macro block to fragment numbers.*/
-typedef oc_mb_map_plane oc_mb_map[3];
-/*A motion vector.*/
-typedef signed char     oc_mv[2];
-
-
-
-/*Super block information.*/
-struct oc_sb_flags{
-  unsigned char coded_fully:1;
-  unsigned char coded_partially:1;
-  unsigned char quad_valid:4;
-};
-
-
-
-/*Information about a fragment which intersects the border of the displayable
-   region.
-  This marks which pixels belong to the displayable region.*/
-struct oc_border_info{
-  /*A bit mask marking which pixels are in the displayable region.
-    Pixel (x,y) corresponds to bit (y<<3|x).*/
-  ogg_int64_t mask;
-  /*The number of pixels in the displayable region.
-    This is always positive, and always less than 64.*/
-  int         npixels;
-};
-
-
-
-/*Fragment information.*/
-struct oc_fragment{
-  /*A flag indicating whether or not this fragment is coded.*/
-  unsigned   coded:1;
-  /*A flag indicating that this entire fragment lies outside the displayable
-     region of the frame.
-    Note the contrast with an invalid macro block, which is outside the coded
-     frame, not just the displayable one.
-    There are no fragments outside the coded frame by construction.*/
-  unsigned   invalid:1;
-  /*The index of the quality index used for this fragment's AC coefficients.*/
-  unsigned   qii:6;
-  /*The mode of the macroblock this fragment belongs to.*/
-  unsigned   mb_mode:3;
-  /*The index of the associated border information for fragments which lie
-     partially outside the displayable region.
-    For fragments completely inside or outside this region, this is -1.
-    Note that the C standard requires an explicit signed keyword for bitfield
-     types, since some compilers may treat them as unsigned without it.*/
-  signed int borderi:5;
-  /*The prediction-corrected DC component.
-    Note that the C standard requires an explicit signed keyword for bitfield
-     types, since some compilers may treat them as unsigned without it.*/
-  signed int dc:16;
-};
-
-
-
-/*A description of each fragment plane.*/
-struct oc_fragment_plane{
-  /*The number of fragments in the horizontal direction.*/
-  int       nhfrags;
-  /*The number of fragments in the vertical direction.*/
-  int       nvfrags;
-  /*The offset of the first fragment in the plane.*/
-  ptrdiff_t froffset;
-  /*The total number of fragments in the plane.*/
-  ptrdiff_t nfrags;
-  /*The number of super blocks in the horizontal direction.*/
-  unsigned  nhsbs;
-  /*The number of super blocks in the vertical direction.*/
-  unsigned  nvsbs;
-  /*The offset of the first super block in the plane.*/
-  unsigned  sboffset;
-  /*The total number of super blocks in the plane.*/
-  unsigned  nsbs;
-};
-
-
-
-/*The shared (encoder and decoder) functions that have accelerated variants.*/
-struct oc_base_opt_vtable{
-  void (*frag_copy)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride);
-  void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
-   const ogg_int16_t _residue[64]);
-  void (*frag_recon_inter)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-  void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
-   const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-  void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
-  void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
-   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-  void (*state_frag_copy_list)(const oc_theora_state *_state,
-   const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
-   int _dst_frame,int _src_frame,int _pli);
-  void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,
-   int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);  
-  void (*restore_fpu)(void);
-};
-
-/*The shared (encoder and decoder) tables that vary according to which variants
-   of the above functions are used.*/
-struct oc_base_opt_data{
-  const unsigned char *dct_fzig_zag;
-};
-
-
-/*State information common to both the encoder and decoder.*/
-struct oc_theora_state{
-  /*The stream information.*/
-  th_info             info;
-  /*Table for shared accelerated functions.*/
-  oc_base_opt_vtable  opt_vtable;
-  /*Table for shared data used by accelerated functions.*/
-  oc_base_opt_data    opt_data;
-  /*CPU flags to detect the presence of extended instruction sets.*/
-  ogg_uint32_t        cpu_flags;
-  /*The fragment plane descriptions.*/
-  oc_fragment_plane   fplanes[3];
-  /*The list of fragments, indexed in image order.*/
-  oc_fragment        *frags;
-  /*The the offset into the reference frame buffer to the upper-left pixel of
-     each fragment.*/
-  ptrdiff_t          *frag_buf_offs;
-  /*The motion vector for each fragment.*/
-  oc_mv              *frag_mvs;
-  /*The total number of fragments in a single frame.*/
-  ptrdiff_t           nfrags;
-  /*The list of super block maps, indexed in image order.*/
-  oc_sb_map          *sb_maps;
-  /*The list of super block flags, indexed in image order.*/
-  oc_sb_flags        *sb_flags;
-  /*The total number of super blocks in a single frame.*/
-  unsigned            nsbs;
-  /*The fragments from each color plane that belong to each macro block.
-    Fragments are stored in image order (left to right then top to bottom).
-    When chroma components are decimated, the extra fragments have an index of
-     -1.*/
-  oc_mb_map          *mb_maps;
-  /*The list of macro block modes.
-    A negative number indicates the macro block lies entirely outside the
-     coded frame.*/
-  signed char        *mb_modes;
-  /*The number of macro blocks in the X direction.*/
-  unsigned            nhmbs;
-  /*The number of macro blocks in the Y direction.*/
-  unsigned            nvmbs;
-  /*The total number of macro blocks.*/
-  size_t              nmbs;
-  /*The list of coded fragments, in coded order.
-    Uncoded fragments are stored in reverse order from the end of the list.*/
-  ptrdiff_t          *coded_fragis;
-  /*The number of coded fragments in each plane.*/
-  ptrdiff_t           ncoded_fragis[3];
-  /*The total number of coded fragments.*/
-  ptrdiff_t           ntotal_coded_fragis;
-  /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
-  int                 ref_frame_idx[4];
-  /*The actual buffers used for the previously decoded frames.*/
-  th_ycbcr_buffer     ref_frame_bufs[4];
-  /*The storage for the reference frame buffers.*/
-  unsigned char      *ref_frame_data[4];
-  /*The strides for each plane in the reference frames.*/
-  int                 ref_ystride[3];
-  /*The number of unique border patterns.*/
-  int                 nborders;
-  /*The unique border patterns for all border fragments.
-    The borderi field of fragments which straddle the border indexes this
-     list.*/
-  oc_border_info      borders[16];
-  /*The frame number of the last keyframe.*/
-  ogg_int64_t         keyframe_num;
-  /*The frame number of the current frame.*/
-  ogg_int64_t         curframe_num;
-  /*The granpos of the current frame.*/
-  ogg_int64_t         granpos;
-  /*The type of the current frame.*/
-  unsigned char       frame_type;
-  /*The bias to add to the frame count when computing granule positions.*/
-  unsigned char       granpos_bias;
-  /*The number of quality indices used in the current frame.*/
-  unsigned char       nqis;
-  /*The quality indices of the current frame.*/
-  unsigned char       qis[3];
-  /*The dequantization tables, stored in zig-zag order, and indexed by
-     qi, pli, qti, and zzi.*/
-  ogg_uint16_t       *dequant_tables[64][3][2];
-  OC_ALIGN16(oc_quant_table      dequant_table_data[64][3][2]);
-  /*Loop filter strength parameters.*/
-  unsigned char       loop_filter_limits[64];
-};
-
-
-
-/*The function type used to fill in the chroma plane motion vectors for a
-   macro block when 4 different motion vectors are specified in the luma
-   plane.
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lmbmv: The luma macro-block level motion vector to fill in for use in
-           prediction.
-  _lbmvs: The luma block-level motion vectors.*/
-typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
-


 /*A map from the index in the zig zag scan to the coefficient number in a
@ -409,14 +100,12 @@ extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12];
 /*The number of indices in the oc_mb_map array that can be valid for each of
   the various chroma decimation types.*/
 extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS];
-/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
-   macro block when 4 different motion vectors are specified in the luma
-   plane.*/
-extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];



 int oc_ilog(unsigned _v);
+void *oc_aligned_malloc(size_t _sz,size_t _align);
+void oc_aligned_free(void *_ptr);
 void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz);
 void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz);
 void oc_free_2d(void *_ptr);
@ -424,86 +113,4 @@ void oc_free_2d(void *_ptr);
 void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
 const th_ycbcr_buffer _src);

-int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
-void oc_state_clear(oc_theora_state *_state);
-void oc_state_vtable_init_c(oc_theora_state *_state);
-void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
- int _y0,int _yend);
-void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
-void oc_state_borders_fill(oc_theora_state *_state,int _refi);
-void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
- th_ycbcr_buffer _img);
-int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
-int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
- int _pli,int _dx,int _dy);
-
-int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
-void oc_state_loop_filter(oc_theora_state *_state,int _frame);
-#if defined(OC_DUMP_IMAGES)
-int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
- const char *_suf);
-#endif
-
-/*Shared accelerated functions.*/
-void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride);
-void oc_frag_recon_intra(const oc_theora_state *_state,
- unsigned char *_dst,int _dst_ystride,const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter2(const oc_theora_state *_state,
- unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
- int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],int _last_zzi);
-void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
-void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
-void oc_restore_fpu(const oc_theora_state *_state);
-
-/*Default pure-C implementations.*/
-void oc_frag_copy_c(unsigned char *_dst,
- const unsigned char *_src,int _src_ystride);
-void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
- const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter_c(unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
- const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
-void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
-void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
-void oc_restore_fpu_c(void);
-
-/*We need a way to call a few encoder functions without introducing a link-time
-   dependency into the decoder, while still allowing the old alpha API which
-   does not distinguish between encoder and decoder objects to be used.
-  We do this by placing a function table at the start of the encoder object
-   which can dispatch into the encoder library.
-  We do a similar thing for the decoder in case we ever decide to split off a
-   common base library.*/
-typedef void (*oc_state_clear_func)(theora_state *_th);
-typedef int (*oc_state_control_func)(theora_state *th,int _req,
- void *_buf,size_t _buf_sz);
-typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
- ogg_int64_t _granulepos);
-typedef double (*oc_state_granule_time_func)(theora_state *_th,
- ogg_int64_t _granulepos);
-
-
-struct oc_state_dispatch_vtable{
-  oc_state_clear_func         clear;
-  oc_state_control_func       control;
-  oc_state_granule_frame_func granule_frame;
-  oc_state_granule_time_func  granule_time;
-};
-
 #endif
--- a/media/libtheora/lib/mathops.h
+++ b/media/libtheora/lib/mathops.h
@ -2,29 +2,27 @@
 # define _mathops_H (1)
 # include <ogg/ogg.h>

-# ifdef __GNUC_PREREQ
-#  if __GNUC_PREREQ(3,4)
-#   include <limits.h>
+# if __GNUC_PREREQ(3,4)
+#  include <limits.h>
 /*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
   "upgrading" the type of an entire expression to an (unsigned) size_t.*/
-#   if INT_MAX>=2147483647
-#    define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
-#    define OC_CLZ32(_x) (__builtin_clz(_x))
-#   elif LONG_MAX>=2147483647L
-#    define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
-#    define OC_CLZ32(_x) (__builtin_clzl(_x))
-#   endif
-#   if INT_MAX>=9223372036854775807LL
-#    define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
-#    define OC_CLZ64(_x) (__builtin_clz(_x))
-#   elif LONG_MAX>=9223372036854775807LL
-#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
-#    define OC_CLZ64(_x) (__builtin_clzl(_x))
-#   elif LLONG_MAX>=9223372036854775807LL|| \
-     __LONG_LONG_MAX__>=9223372036854775807LL
-#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
-#    define OC_CLZ64(_x) (__builtin_clzll(_x))
-#   endif
+#  if INT_MAX>=2147483647
+#   define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#   define OC_CLZ32(_x) (__builtin_clz(_x))
+#  elif LONG_MAX>=2147483647L
+#   define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#   define OC_CLZ32(_x) (__builtin_clzl(_x))
+#  endif
+#  if INT_MAX>=9223372036854775807LL
+#   define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#   define OC_CLZ64(_x) (__builtin_clz(_x))
+#  elif LONG_MAX>=9223372036854775807LL
+#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#   define OC_CLZ64(_x) (__builtin_clzl(_x))
+#  elif LLONG_MAX>=9223372036854775807LL|| \
+    __LONG_LONG_MAX__>=9223372036854775807LL
+#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
+#   define OC_CLZ64(_x) (__builtin_clzll(_x))
 #  endif
 # endif

@ -134,8 +132,12 @@ int oc_ilog64(ogg_int64_t _v);
 # define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v)))

 #define OC_Q57(_v) ((ogg_int64_t)(_v)<<57)
+#define OC_Q10(_v) ((_v)<<10)

 ogg_int64_t oc_bexp64(ogg_int64_t _z);
 ogg_int64_t oc_blog64(ogg_int64_t _w);

+ogg_uint32_t oc_bexp32_q10(int _z);
+int oc_blog32_q10(ogg_uint32_t _w);
+
 #endif
--- a/media/libtheora/lib/quant.c
+++ b/media/libtheora/lib/quant.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: quant.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: quant.c 17307 2010-06-27 06:02:15Z tterribe $

 ********************************************************************/

@ -21,6 +21,14 @@
 #include "quant.h"
 #include "decint.h"

+/*The maximum output of the DCT with +/- 255 inputs is +/- 8157.
+  These minimum quantizers ensure the result after quantization (and after
+   prediction for DC) will be no more than +/- 510.
+  The tokenization system can handle values up to +/- 580, so there is no need
+   to do any coefficient clamping.
+  I would rather have allowed smaller quantizers and had to clamp, but these
+   minimums were required when constructing the original VP3 matrices and have
+   been formalized in the spec.*/
 static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
 static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};

--- a/media/libtheora/lib/state.c
+++ b/media/libtheora/lib/state.c
@ -11,25 +11,92 @@
 ********************************************************************

  function:
-    last mod: $Id: state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: state.c 17576 2010-10-29 01:07:51Z tterribe $

 ********************************************************************/

 #include <stdlib.h>
 #include <string.h>
-#include "internal.h"
-#if defined(OC_X86_ASM)
-#if defined(_MSC_VER)
-# include "x86_vc/x86int.h"
-#else
-# include "x86/x86int.h"
-#endif
-#endif
+#include "state.h"
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
 #endif

+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the X and Y directions
+   (4:2:0).
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1])
+   +OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
+  dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1])
+   +OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
+  _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,2,2),OC_DIV_ROUND_POW2(dy,2,2));
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the Y direction.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[2]);
+  dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[2]);
+  _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+  dx=OC_MV_X(_lbmvs[1])+OC_MV_X(_lbmvs[3]);
+  dy=OC_MV_Y(_lbmvs[1])+OC_MV_Y(_lbmvs[3]);
+  _cbmvs[1]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the X direction (4:2:2).
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]);
+  dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]);
+  _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+  dx=OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
+  dy=OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
+  _cbmvs[2]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with no chroma decimation (4:4:4).
+  Each luma block-level motion vector is copied unchanged to the chroma block
+   at the same index.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  _cbmvs[0]=_lbmvs[0];
+  _cbmvs[1]=_lbmvs[1];
+  _cbmvs[2]=_lbmvs[2];
+  _cbmvs[3]=_lbmvs[3];
+}
+
+/*A table of functions used to fill in the chroma plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.*/
+const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs11
+};
+
+
+
 /*Returns the fragment index of the top-left block in a macro block.
  This can be used to test whether or not the whole macro block is valid.
  _sb_map: The super block map.
@ -469,7 +536,7 @@ static void oc_state_frarray_clear(oc_theora_state *_state){
   unrestricted motion vectors without special casing the boundary.
  If chroma is decimated in either direction, the padding is reduced by a
   factor of 2 on the appropriate sides.
-  _nrefs: The number of reference buffers to init; must be 3 or 4.*/
+  _nrefs: The number of reference buffers to init; must be in the range 3...6.*/
 static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
  th_info       *info;
  unsigned char *ref_frame_data;
@ -481,6 +548,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
  int            yheight;
  int            chstride;
  int            cheight;
+  ptrdiff_t      align;
  ptrdiff_t      yoffset;
  ptrdiff_t      coffset;
  ptrdiff_t     *frag_buf_offs;
@ -489,28 +557,33 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
  int            vdec;
  int            rfi;
  int            pli;
-  if(_nrefs<3||_nrefs>4)return TH_EINVAL;
+  if(_nrefs<3||_nrefs>6)return TH_EINVAL;
  info=&_state->info;
  /*Compute the image buffer parameters for each plane.*/
  hdec=!(info->pixel_fmt&1);
  vdec=!(info->pixel_fmt&2);
  yhstride=info->frame_width+2*OC_UMV_PADDING;
  yheight=info->frame_height+2*OC_UMV_PADDING;
-  chstride=yhstride>>hdec;
+  /*Require 16-byte aligned rows in the chroma planes.*/
+  chstride=(yhstride>>hdec)+15&~15;
  cheight=yheight>>vdec;
  yplane_sz=yhstride*(size_t)yheight;
  cplane_sz=chstride*(size_t)cheight;
  yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
  coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
-  ref_frame_sz=yplane_sz+2*cplane_sz;
+  /*Although we guarantee the rows of the chroma planes are a multiple of 16
+     bytes, the initial padding on the first row may only be 8 bytes.
+    Compute the offset needed to align the actual image data to 16 bytes.*/
+  align=-coffset&15;
+  ref_frame_sz=yplane_sz+2*cplane_sz+16;
  ref_frame_data_sz=_nrefs*ref_frame_sz;
  /*Check for overflow.
    The same caveats apply as for oc_state_frarray_init().*/
-  if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
+  if(yplane_sz/yhstride!=(size_t)yheight||2*cplane_sz+16<cplane_sz||
   ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
    return TH_EIMPL;
  }
-  ref_frame_data=_ogg_malloc(ref_frame_data_sz);
+  ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16);
  frag_buf_offs=_state->frag_buf_offs=
   _ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
  if(ref_frame_data==NULL||frag_buf_offs==NULL){
@ -532,15 +605,15 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
    memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
     sizeof(_state->ref_frame_bufs[0]));
  }
+  _state->ref_frame_handle=ref_frame_data;
  /*Set up the data pointers for the image buffers.*/
  for(rfi=0;rfi<_nrefs;rfi++){
-    _state->ref_frame_data[rfi]=ref_frame_data;
    _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
-    ref_frame_data+=yplane_sz;
+    ref_frame_data+=yplane_sz+align;
    _state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
    ref_frame_data+=cplane_sz;
    _state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
-    ref_frame_data+=cplane_sz;
+    ref_frame_data+=cplane_sz+(16-align);
    /*Flip the buffer upside down.
      This allows us to decode Theora's bottom-up frames in their natural
       order, yet return a top-down buffer with a positive stride to the user.*/
@ -550,7 +623,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
  _state->ref_ystride[0]=-yhstride;
  _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
  /*Initialize the fragment buffer offsets.*/
-  ref_frame_data=_state->ref_frame_data[0];
+  ref_frame_data=_state->ref_frame_bufs[0][0].data;
  fragi=0;
  for(pli=0;pli<3;pli++){
    th_img_plane      *iplane;
@ -576,41 +649,44 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
      vpix+=stride<<3;
    }
  }
-  /*Initialize the reference frame indices.*/
+  /*Initialize the reference frame pointers and indices.*/
  _state->ref_frame_idx[OC_FRAME_GOLD]=
   _state->ref_frame_idx[OC_FRAME_PREV]=
-   _state->ref_frame_idx[OC_FRAME_SELF]=-1;
-  _state->ref_frame_idx[OC_FRAME_IO]=_nrefs>3?3:-1;
+   _state->ref_frame_idx[OC_FRAME_GOLD_ORIG]=
+   _state->ref_frame_idx[OC_FRAME_PREV_ORIG]=
+   _state->ref_frame_idx[OC_FRAME_SELF]=
+   _state->ref_frame_idx[OC_FRAME_IO]=-1;
+  _state->ref_frame_data[OC_FRAME_GOLD]=
+   _state->ref_frame_data[OC_FRAME_PREV]=
+   _state->ref_frame_data[OC_FRAME_GOLD_ORIG]=
+   _state->ref_frame_data[OC_FRAME_PREV_ORIG]=
+   _state->ref_frame_data[OC_FRAME_SELF]=
+   _state->ref_frame_data[OC_FRAME_IO]=NULL;
  return 0;
 }

 static void oc_state_ref_bufs_clear(oc_theora_state *_state){
  _ogg_free(_state->frag_buf_offs);
-  _ogg_free(_state->ref_frame_data[0]);
+  oc_aligned_free(_state->ref_frame_handle);
 }


-void oc_state_vtable_init_c(oc_theora_state *_state){
+void oc_state_accel_init_c(oc_theora_state *_state){
+  _state->cpu_flags=0;
+#if defined(OC_STATE_USE_VTABLE)
  _state->opt_vtable.frag_copy=oc_frag_copy_c;
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c;
  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
  _state->opt_vtable.idct8x8=oc_idct8x8_c;
  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
-  _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
+  _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c;
  _state->opt_vtable.state_loop_filter_frag_rows=
   oc_state_loop_filter_frag_rows_c;
  _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
-  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
-}
-
-/*Initialize the accelerated function pointers.*/
-void oc_state_vtable_init(oc_theora_state *_state){
-#if defined(OC_X86_ASM)
-  oc_state_vtable_init_x86(_state);
-#else
-  oc_state_vtable_init_c(_state);
 #endif
+  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
 }


@ -648,7 +724,7 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
     system.*/
  _state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
  _state->frame_type=OC_UNKWN_FRAME;
-  oc_state_vtable_init(_state);
+  oc_state_accel_init(_state);
  ret=oc_state_frarray_init(_state);
  if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs);
  if(ret<0){
@ -758,11 +834,10 @@ void oc_state_borders_fill(oc_theora_state *_state,int _refi){
            _offsets[1] is set if the motion vector has non-zero fractional
             components.
  _pli:     The color plane index.
-  _dx:      The X component of the motion vector.
-  _dy:      The Y component of the motion vector.
+  _mv:      The motion vector.
  Return: The number of offsets returned: 1 or 2.*/
 int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
- int _pli,int _dx,int _dy){
+ int _pli,oc_mv _mv){
  /*Here is a brief description of how Theora handles motion vectors:
    Motion vector components are specified to half-pixel accuracy in
     undecimated directions of each plane, and quarter-pixel accuracy in
@ -785,21 +860,25 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
  int xfrac;
  int yfrac;
  int offs;
+  int dx;
+  int dy;
  ystride=_state->ref_ystride[_pli];
  /*These two variables decide whether we are in half- or quarter-pixel
     precision in each component.*/
  xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1));
  yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2));
+  dx=OC_MV_X(_mv);
+  dy=OC_MV_Y(_mv);
  /*These two variables are either 0 if all the fractional bits are zero or -1
     if any of them are non-zero.*/
-  xfrac=OC_SIGNMASK(-(_dx&(xprec|1)));
-  yfrac=OC_SIGNMASK(-(_dy&(yprec|1)));
-  offs=(_dx>>xprec)+(_dy>>yprec)*ystride;
+  xfrac=OC_SIGNMASK(-(dx&(xprec|1)));
+  yfrac=OC_SIGNMASK(-(dy&(yprec|1)));
+  offs=(dx>>xprec)+(dy>>yprec)*ystride;
  if(xfrac||yfrac){
    int xmask;
    int ymask;
-    xmask=OC_SIGNMASK(_dx);
-    ymask=OC_SIGNMASK(_dy);
+    xmask=OC_SIGNMASK(dx);
+    ymask=OC_SIGNMASK(dy);
    yfrac&=ystride;
    _offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask);
    _offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask);
@ -848,13 +927,17 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
  int mx2;
  int my2;
  int offs;
+  int dx;
+  int dy;
  ystride=_state->ref_ystride[_pli];
  qpy=_pli!=0&&!(_state->info.pixel_fmt&2);
-  my=OC_MVMAP[qpy][_dy+31];
-  my2=OC_MVMAP2[qpy][_dy+31];
+  dx=OC_MV_X(_mv);
+  dy=OC_MV_Y(_mv);
+  my=OC_MVMAP[qpy][dy+31];
+  my2=OC_MVMAP2[qpy][dy+31];
  qpx=_pli!=0&&!(_state->info.pixel_fmt&1);
-  mx=OC_MVMAP[qpx][_dx+31];
-  mx2=OC_MVMAP2[qpx][_dx+31];
+  mx=OC_MVMAP[qpx][dx+31];
+  mx2=OC_MVMAP2[qpx][dx+31];
  offs=my*ystride+mx;
  if(mx2||my2){
    _offsets[1]=offs+my2*ystride+mx2;
@ -866,18 +949,12 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
 #endif
 }

-void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
-  _state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs,
-   _last_zzi,_dc_quant);
-}
-
 void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
-  int            mb_mode;
+  int            refi;
  /*Apply the inverse transform.*/
  /*Special case only having a DC component.*/
  if(_last_zzi<2){
@ -887,69 +964,35 @@ void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
       no iDCT rounding.*/
    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
    /*LOOP VECTORIZES.*/
-    for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+    for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p;
  }
  else{
    /*First, dequantize the DC coefficient.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8(_state,_dct_coeffs,_last_zzi);
+    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
  else{
    const unsigned char *ref;
    int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2(_state,
-       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
+       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64);
+    }
+    else{
+      oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
    }
-    else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
  }
 }

-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  _state->opt_vtable.state_frag_copy_list(_state,_fragis,_nfragis,_dst_frame,
-   _src_frame,_pli);
-}
-
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-    oc_frag_copy(_state,dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
-  }
-}
-
-static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){
  int y;
  _pix-=2;
  for(y=0;y<8;y++){
@ -965,7 +1008,7 @@ static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
  }
 }

-static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){
  int x;
  _pix-=_ystride*2;
  for(x=0;x<8;x++){
@ -982,20 +1025,16 @@ static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){

 /*Initialize the bounding values array used by the loop filter.
  _bv: Storage for the array.
-  Return: 0 on success, or a non-zero value if no filtering need be applied.*/
-int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
-  int flimit;
+  _flimit: The filter limit as defined in Section 7.10 of the spec.*/
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit){
  int i;
-  flimit=_state->loop_filter_limits[_state->qis[0]];
-  if(flimit==0)return 1;
  memset(_bv,0,sizeof(_bv[0])*256);
-  for(i=0;i<flimit;i++){
-    if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit;
-    _bv[127-i]=-i;
-    _bv[127+i]=i;
-    if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i;
+  for(i=0;i<_flimit;i++){
+    if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit);
+    _bv[127-i]=(signed char)(-i);
+    _bv[127+i]=(signed char)(i);
+    if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i);
  }
-  return 0;
 }

 /*Apply the loop filter to a given set of fragment rows in the given plane.
@ -1006,14 +1045,8 @@ int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
  _pli:       The color plane to filter.
  _fragy0:    The Y coordinate of the first fragment row to filter.
  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int _bv[256],
- int _refi,int _pli,int _fragy0,int _fragy_end){
-  _state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
-   _fragy0,_fragy_end);
-}
-
-void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
+ signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){
  const oc_fragment_plane *fplane;
  const oc_fragment       *frags;
  const ptrdiff_t         *frag_buf_offs;
@ -1030,7 +1063,7 @@ void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
  fragi_top=fplane->froffset;
  fragi_bot=fragi_top+fplane->nfrags;
  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
-  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
  ystride=_state->ref_ystride[_pli];
  frags=_state->frags;
  frag_buf_offs=_state->frag_buf_offs;
--- a/media/libtheora/lib/state.h
+++ b/media/libtheora/lib/state.h
@ -0,0 +1,552 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: internal.h 17337 2010-07-19 16:08:54Z tterribe $
+
+ ********************************************************************/
+#if !defined(_state_H)
+# define _state_H (1)
+# include "internal.h"
+# include "huffman.h"
+# include "quant.h"
+
+
+
+/*A single quadrant of the map from a super block to fragment numbers.*/
+typedef ptrdiff_t       oc_sb_map_quad[4];
+/*A map from a super block to fragment numbers.*/
+typedef oc_sb_map_quad  oc_sb_map[4];
+/*A single plane of the map from a macro block to fragment numbers.*/
+typedef ptrdiff_t       oc_mb_map_plane[4];
+/*A map from a macro block to fragment numbers.*/
+typedef oc_mb_map_plane oc_mb_map[3];
+/*A motion vector, packed into 16 bits: the X component occupies the low 8
+   bits and the Y component the bits above them.
+  Use OC_MV() to build one and OC_MV_X()/OC_MV_Y() to unpack it.
+  The type must stay signed: OC_MV_Y() recovers a negative Y component by
+   right-shifting the packed value.*/
+typedef ogg_int16_t     oc_mv;
+
+typedef struct oc_sb_flags              oc_sb_flags;
+typedef struct oc_border_info           oc_border_info;
+typedef struct oc_fragment              oc_fragment;
+typedef struct oc_fragment_plane        oc_fragment_plane;
+typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
+typedef struct oc_base_opt_data         oc_base_opt_data;
+typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
+typedef struct oc_theora_state          oc_theora_state;
+
+
+
+/*Shared accelerated functions.*/
+# if defined(OC_X86_ASM)
+#  if defined(_MSC_VER)
+#   include "x86_vc/x86int.h"
+#  else
+#   include "x86/x86int.h"
+#  endif
+# endif
+# if defined(OC_ARM_ASM)
+#  include "arm/armint.h"
+# endif
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xint.h"
+# endif
+/*Dispatch for the shared accelerated functions.
+  Any of the names below may already have been defined as a macro by one of
+   the platform headers included above; the defaults here apply only to the
+   names that were not.
+  oc_state_accel_init defaults to the pure-C initializer.
+  With OC_STATE_USE_VTABLE defined, each remaining default calls through the
+   matching function pointer in (_state)->opt_vtable.
+  Without it, each default binds directly to the pure-C implementation: the
+   fragment-level helpers, oc_idct8x8() and oc_loop_filter_init() drop the
+   _state argument, and oc_restore_fpu() expands to an empty statement.*/
+# if !defined(oc_state_accel_init)
+#  define oc_state_accel_init oc_state_accel_init_c
+# endif
+# if defined(OC_STATE_USE_VTABLE)
+#  if !defined(oc_frag_copy)
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  ((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride))
+#  endif
+#  if !defined(oc_frag_copy_list)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ ((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \
+  _fragis,_nfragis,_frag_buf_offs))
+#  endif
+#  if !defined(oc_frag_recon_intra)
+#   define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+  ((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue))
+#  endif
+#  if !defined(oc_frag_recon_inter)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  ((*(_state)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
+#  endif
+#  if !defined(oc_frag_recon_inter2)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  ((*(_state)->opt_vtable.frag_recon_inter2)(_dst, \
+   _src1,_src2,_ystride,_residue))
+#  endif
+# if !defined(oc_idct8x8)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  ((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi))
+#  endif
+#  if !defined(oc_state_frag_recon)
+#   define oc_state_frag_recon(_state,_fragi, \
+ _pli,_dct_coeffs,_last_zzi,_dc_quant) \
+  ((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \
+   _pli,_dct_coeffs,_last_zzi,_dc_quant))
+#  endif
+#  if !defined(oc_loop_filter_init)
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  ((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit))
+#  endif
+#  if !defined(oc_state_loop_filter_frag_rows)
+#   define oc_state_loop_filter_frag_rows(_state, \
+ _bv,_refi,_pli,_fragy0,_fragy_end) \
+  ((*(_state)->opt_vtable.state_loop_filter_frag_rows)(_state, \
+   _bv,_refi,_pli,_fragy0,_fragy_end))
+#  endif
+#  if !defined(oc_restore_fpu)
+#   define oc_restore_fpu(_state) \
+  ((*(_state)->opt_vtable.restore_fpu)())
+#  endif
+# else
+#  if !defined(oc_frag_copy)
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_c(_dst,_src,_ystride)
+#  endif
+#  if !defined(oc_frag_copy_list)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \
+  _fragis,_nfragis,_frag_buf_offs)
+#  endif
+#  if !defined(oc_frag_recon_intra)
+#   define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+  oc_frag_recon_intra_c(_dst,_dst_ystride,_residue)
+#  endif
+#  if !defined(oc_frag_recon_inter)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
+#  endif
+#  if !defined(oc_frag_recon_inter2)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue)
+#  endif
+#  if !defined(oc_idct8x8)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi)
+#  endif
+#  if !defined(oc_state_frag_recon)
+#   define oc_state_frag_recon oc_state_frag_recon_c
+#  endif
+#  if !defined(oc_loop_filter_init)
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_c(_bv,_flimit)
+#  endif
+#  if !defined(oc_state_loop_filter_frag_rows)
+#   define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c
+#  endif
+#  if !defined(oc_restore_fpu)
+#   define oc_restore_fpu(_state) do{}while(0)
+#  endif
+# endif
+
+
+
+/*A keyframe.*/
+# define OC_INTRA_FRAME (0)
+/*A predicted frame.*/
+# define OC_INTER_FRAME (1)
+/*A frame of unknown type (frame type decision has not yet been made).*/
+# define OC_UNKWN_FRAME (-1)
+
+/*The amount of padding to add to the reconstructed frame buffers on all
+   sides.
+  This is used to allow unrestricted motion vectors without special casing.
+  This must be a multiple of 2.*/
+# define OC_UMV_PADDING (16)
+
+/*Frame classification indices.
+  These index the ref_frame_idx[] and ref_frame_data[] arrays in
+   oc_theora_state.
+  The values 0 through 3 also fit in the 2-bit refi field of oc_fragment.*/
+/*The previous golden frame.*/
+# define OC_FRAME_GOLD      (0)
+/*The previous frame.*/
+# define OC_FRAME_PREV      (1)
+/*The current frame.*/
+# define OC_FRAME_SELF      (2)
+/*Used to mark uncoded fragments (for DC prediction).*/
+# define OC_FRAME_NONE      (3)
+
+/*The input or output buffer.
+  Note that this shares the value 3 with OC_FRAME_NONE above.*/
+# define OC_FRAME_IO        (3)
+/*The uncompressed previous golden frame.*/
+# define OC_FRAME_GOLD_ORIG (4)
+/*The uncompressed previous frame.*/
+# define OC_FRAME_PREV_ORIG (5)
+
+/*Macroblock modes.*/
+/*Macro block is invalid: It is never coded.*/
+# define OC_MODE_INVALID        (-1)
+/*Encoded difference from the same macro block in the previous frame.*/
+# define OC_MODE_INTER_NOMV     (0)
+/*Encoded with no motion compensated prediction.*/
+# define OC_MODE_INTRA          (1)
+/*Encoded difference from the previous frame offset by the given motion
+   vector.*/
+# define OC_MODE_INTER_MV       (2)
+/*Encoded difference from the previous frame offset by the last coded motion
+   vector.*/
+# define OC_MODE_INTER_MV_LAST  (3)
+/*Encoded difference from the previous frame offset by the second to last
+   coded motion vector.*/
+# define OC_MODE_INTER_MV_LAST2 (4)
+/*Encoded difference from the same macro block in the previous golden
+   frame.*/
+# define OC_MODE_GOLDEN_NOMV    (5)
+/*Encoded difference from the previous golden frame offset by the given motion
+   vector.*/
+# define OC_MODE_GOLDEN_MV      (6)
+/*Encoded difference from the previous frame offset by the individual motion
+   vectors given for each block.*/
+# define OC_MODE_INTER_MV_FOUR  (7)
+/*The number of (coded) modes.*/
+# define OC_NMODES              (8)
+
+/*Determines the reference frame used for a given MB mode.*/
+# define OC_FRAME_FOR_MODE(_x) \
+ OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
+  OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
+
+/*Constants for the packet state machine common between encoder and decoder.*/
+
+/*Next packet to emit/read: Codec info header.*/
+# define OC_PACKET_INFO_HDR    (-3)
+/*Next packet to emit/read: Comment header.*/
+# define OC_PACKET_COMMENT_HDR (-2)
+/*Next packet to emit/read: Codec setup header.*/
+# define OC_PACKET_SETUP_HDR   (-1)
+/*No more packets to emit/read.*/
+# define OC_PACKET_DONE        (INT_MAX)
+
+
+/*Helpers for the packed oc_mv representation.
+  OC_MV(_x,_y): Packs the low 8 bits of _x together with _y shifted into the
+                 bits above them, then converts the result to oc_mv.
+  OC_MV_X(_mv): Extracts the X component by converting to signed char.
+  OC_MV_Y(_mv): Extracts the Y component by shifting right by 8 bits.
+  OC_MV_ADD() and OC_MV_SUB() add or subtract two vectors component-wise and
+   repack the result with OC_MV(), which keeps only the low 8 bits of the X
+   result.
+  OC_MV_ADD() and OC_MV_SUB() expand each of their arguments twice, so the
+   arguments must not have side effects.
+  NOTE(review): (_y)<<8 left-shifts a possibly negative value, which ISO C
+   leaves undefined, and (_mv)>>8 on a negative value is
+   implementation-defined; (_y)*256 would be a portable spelling of the
+   former -- confirm before changing.*/
+#define OC_MV(_x,_y)         ((oc_mv)((_x)&0xFF|(_y)<<8))
+#define OC_MV_X(_mv)         ((signed char)(_mv))
+#define OC_MV_Y(_mv)         ((_mv)>>8)
+#define OC_MV_ADD(_mv1,_mv2) \
+  OC_MV(OC_MV_X(_mv1)+OC_MV_X(_mv2), \
+   OC_MV_Y(_mv1)+OC_MV_Y(_mv2))
+#define OC_MV_SUB(_mv1,_mv2) \
+  OC_MV(OC_MV_X(_mv1)-OC_MV_X(_mv2), \
+   OC_MV_Y(_mv1)-OC_MV_Y(_mv2))
+
+
+
+/*Super blocks are 32x32 segments of pixels in a single color plane indexed
+   in image order.
+  Internally, super blocks are broken up into four quadrants, each of which
+   contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
+  Quadrants, and the blocks within them, are indexed in a special order called
+   a "Hilbert curve" within the super block.
+
+  In order to differentiate between the Hilbert-curve indexing strategy and
+   the regular image order indexing strategy, blocks indexed in image order
+   are called "fragments".
+  Fragments are indexed in image order, left to right, then bottom to top,
+   from Y' plane to Cb plane to Cr plane.
+
+  The co-located fragments in all image planes corresponding to the location
+   of a single quadrant of a luma plane super block form a macro block.
+  Thus there is only a single set of macro blocks for all planes, each of which
+   contains between 6 and 12 fragments, depending on the pixel format.
+  Therefore macro block information is kept in a separate set of arrays from
+   super blocks to avoid unused space in the other planes.
+  The lists are indexed in super block order.
+  That is, the macro block corresponding to the macro block mbi in (luma plane)
+   super block sbi is at index (sbi<<2|mbi).
+  Thus the number of macro blocks in each dimension is always twice the number
+   of super blocks, even when only an odd number fall inside the coded frame.
+  These "extra" macro blocks are just an artifact of our internal data layout,
+   and not part of the coded stream; they are flagged with a negative MB mode.*/
+
+
+
+/*Super block information, packed into bit-fields.
+  coded_fully and coded_partially are single-bit flags; quad_valid is 4 bits
+   wide.
+  NOTE(review): presumably the two flags record whether all or only some of
+   the super block's fragments are coded, and quad_valid holds one bit per
+   quadrant (a super block has four quadrants) -- confirm against the code
+   that fills these in, which is not part of this header.*/
+struct oc_sb_flags{
+  unsigned char coded_fully:1;
+  unsigned char coded_partially:1;
+  unsigned char quad_valid:4;
+};
+
+
+
+/*Information about a fragment which intersects the border of the displayable
+   region.
+  This marks which pixels belong to the displayable region.*/
+struct oc_border_info{
+  /*A bit mask marking which pixels are in the displayable region.
+    Pixel (x,y) corresponds to bit (y<<3|x).*/
+  ogg_int64_t mask;
+  /*The number of pixels in the displayable region.
+    This is always positive, and always less than 64.*/
+  int         npixels;
+};
+
+
+
+/*Fragment information.*/
+struct oc_fragment{
+  /*A flag indicating whether or not this fragment is coded.*/
+  unsigned   coded:1;
+  /*A flag indicating that this entire fragment lies outside the displayable
+     region of the frame.
+    Note the contrast with an invalid macro block, which is outside the coded
+     frame, not just the displayable one.
+    There are no fragments outside the coded frame by construction.*/
+  unsigned   invalid:1;
+  /*The index of the quality index used for this fragment's AC coefficients.*/
+  unsigned   qii:4;
+  /*The index of the reference frame this fragment is predicted from.*/
+  unsigned   refi:2;
+  /*The mode of the macroblock this fragment belongs to.*/
+  unsigned   mb_mode:3;
+  /*The index of the associated border information for fragments which lie
+     partially outside the displayable region.
+    For fragments completely inside or outside this region, this is -1.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int borderi:5;
+  /*The prediction-corrected DC component.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int dc:16;
+};
+
+
+
+/*A description of each fragment plane.*/
+struct oc_fragment_plane{
+  /*The number of fragments in the horizontal direction.*/
+  int       nhfrags;
+  /*The number of fragments in the vertical direction.*/
+  int       nvfrags;
+  /*The offset of the first fragment in the plane.*/
+  ptrdiff_t froffset;
+  /*The total number of fragments in the plane.*/
+  ptrdiff_t nfrags;
+  /*The number of super blocks in the horizontal direction.*/
+  unsigned  nhsbs;
+  /*The number of super blocks in the vertical direction.*/
+  unsigned  nvsbs;
+  /*The offset of the first super block in the plane.*/
+  unsigned  sboffset;
+  /*The total number of super blocks in the plane.*/
+  unsigned  nsbs;
+};
+
+/*The signature of the functions that apply the loop filter to a range of
+   fragment rows in one color plane of a reference frame.
+  It matches oc_state_loop_filter_frag_rows_c(), declared later in this header.
+  _bv is the 256-entry bounding values table filled in by
+   oc_loop_filter_init_c() or an accelerated equivalent.*/
+typedef void (*oc_state_loop_filter_frag_rows_func)(
+ const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli,
+ int _fragy0,int _fragy_end);
+
+/*The shared (encoder and decoder) functions that have accelerated variants.
+  Each member takes the same parameter types as the pure-C default declared
+   later in this header under the same name with an oc_ prefix and a _c suffix
+   (e.g., frag_copy and oc_frag_copy_c()).
+  When OC_STATE_USE_VTABLE is defined, the oc_*() wrapper macros above call
+   through these pointers.
+  idct8x8 takes separate output (_y) and input (_x) blocks.
+  state_frag_recon takes a 128-entry buffer: oc_state_frag_recon_c() reads the
+   coefficients from the first 64 entries and places the residue used for
+   reconstruction in the last 64.*/
+struct oc_base_opt_vtable{
+  void (*frag_copy)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride);
+  void (*frag_copy_list)(unsigned char *_dst_frame,
+   const unsigned char *_src_frame,int _ystride,
+   const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+  void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
+   const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+  void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+  void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
+   int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+  void (*loop_filter_init)(signed char _bv[256],int _flimit);
+  oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows;
+  void (*restore_fpu)(void);
+};
+
+/*The shared (encoder and decoder) tables that vary according to which variants
+   of the above functions are used.*/
+struct oc_base_opt_data{
+  const unsigned char *dct_fzig_zag;
+};
+
+
+/*State information common to both the encoder and decoder.*/
+struct oc_theora_state{
+  /*The stream information.*/
+  th_info             info;
+# if defined(OC_STATE_USE_VTABLE)
+  /*Table for shared accelerated functions.*/
+  oc_base_opt_vtable  opt_vtable;
+# endif
+  /*Table for shared data used by accelerated functions.*/
+  oc_base_opt_data    opt_data;
+  /*CPU flags to detect the presence of extended instruction sets.*/
+  ogg_uint32_t        cpu_flags;
+  /*The fragment plane descriptions.*/
+  oc_fragment_plane   fplanes[3];
+  /*The list of fragments, indexed in image order.*/
+  oc_fragment        *frags;
+  /*The offset into the reference frame buffer to the upper-left pixel of
+     each fragment.*/
+  ptrdiff_t          *frag_buf_offs;
+  /*The motion vector for each fragment.*/
+  oc_mv              *frag_mvs;
+  /*The total number of fragments in a single frame.*/
+  ptrdiff_t           nfrags;
+  /*The list of super block maps, indexed in image order.*/
+  oc_sb_map          *sb_maps;
+  /*The list of super block flags, indexed in image order.*/
+  oc_sb_flags        *sb_flags;
+  /*The total number of super blocks in a single frame.*/
+  unsigned            nsbs;
+  /*The fragments from each color plane that belong to each macro block.
+    Fragments are stored in image order (left to right then top to bottom).
+    When chroma components are decimated, the extra fragments have an index of
+     -1.*/
+  oc_mb_map          *mb_maps;
+  /*The list of macro block modes.
+    A negative number indicates the macro block lies entirely outside the
+     coded frame.*/
+  signed char        *mb_modes;
+  /*The number of macro blocks in the X direction.*/
+  unsigned            nhmbs;
+  /*The number of macro blocks in the Y direction.*/
+  unsigned            nvmbs;
+  /*The total number of macro blocks.*/
+  size_t              nmbs;
+  /*The list of coded fragments, in coded order.
+    Uncoded fragments are stored in reverse order from the end of the list.*/
+  ptrdiff_t          *coded_fragis;
+  /*The number of coded fragments in each plane.*/
+  ptrdiff_t           ncoded_fragis[3];
+  /*The total number of coded fragments.*/
+  ptrdiff_t           ntotal_coded_fragis;
+  /*The actual buffers used for the reference frames.*/
+  th_ycbcr_buffer     ref_frame_bufs[6];
+  /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
+  int                 ref_frame_idx[6];
+  /*The storage for the reference frame buffers.
+    This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here
+     for faster look-up.*/
+  unsigned char      *ref_frame_data[6];
+  /*The handle used to allocate the reference frame buffers.*/
+  unsigned char      *ref_frame_handle;
+  /*The strides for each plane in the reference frames.*/
+  int                 ref_ystride[3];
+  /*The number of unique border patterns.*/
+  int                 nborders;
+  /*The unique border patterns for all border fragments.
+    The borderi field of fragments which straddle the border indexes this
+     list.*/
+  oc_border_info      borders[16];
+  /*The frame number of the last keyframe.*/
+  ogg_int64_t         keyframe_num;
+  /*The frame number of the current frame.*/
+  ogg_int64_t         curframe_num;
+  /*The granpos of the current frame.*/
+  ogg_int64_t         granpos;
+  /*The type of the current frame.*/
+  signed char         frame_type;
+  /*The bias to add to the frame count when computing granule positions.*/
+  unsigned char       granpos_bias;
+  /*The number of quality indices used in the current frame.*/
+  unsigned char       nqis;
+  /*The quality indices of the current frame.*/
+  unsigned char       qis[3];
+  /*The dequantization tables, stored in zig-zag order, and indexed by
+     qi, pli, qti, and zzi.*/
+  ogg_uint16_t       *dequant_tables[64][3][2];
+  OC_ALIGN16(oc_quant_table      dequant_table_data[64][3][2]);
+  /*Loop filter strength parameters.*/
+  unsigned char       loop_filter_limits[64];
+};
+
+
+
+/*The function type used to fill in the chroma plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
+
+
+
+/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.*/
+extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
+
+
+
+int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
+void oc_state_clear(oc_theora_state *_state);
+void oc_state_accel_init_c(oc_theora_state *_state);
+void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
+ int _y0,int _yend);
+void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
+void oc_state_borders_fill(oc_theora_state *_state,int _refi);
+void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
+ th_ycbcr_buffer _img);
+int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
+int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
+ int _pli,oc_mv _mv);
+
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit);
+void oc_state_loop_filter(oc_theora_state *_state,int _frame);
+# if defined(OC_DUMP_IMAGES)
+int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
+ const char *_suf);
+# endif
+
+/*Default pure-C implementations of shared accelerated functions.*/
+void oc_frag_copy_c(unsigned char *_dst,
+ const unsigned char *_src,int _src_ystride);
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter_c(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_c(void);
+
+/*We need a way to call a few encoder functions without introducing a link-time
+   dependency into the decoder, while still allowing the old alpha API which
+   does not distinguish between encoder and decoder objects to be used.
+  We do this by placing a function table at the start of the encoder object
+   which can dispatch into the encoder library.
+  We do a similar thing for the decoder in case we ever decide to split off a
+   common base library.*/
+typedef void (*oc_state_clear_func)(theora_state *_th);
+typedef int (*oc_state_control_func)(theora_state *th,int _req,
+ void *_buf,size_t _buf_sz);
+typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
+ ogg_int64_t _granulepos);
+typedef double (*oc_state_granule_time_func)(theora_state *_th,
+ ogg_int64_t _granulepos);
+
+
+struct oc_state_dispatch_vtable{
+  oc_state_clear_func         clear;
+  oc_state_control_func       control;
+  oc_state_granule_frame_func granule_frame;
+  oc_state_granule_time_func  granule_time;
+};
+
+#endif
--- a/media/libtheora/lib/x86/mmxfrag.c
+++ b/media/libtheora/lib/x86/mmxfrag.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

@ -22,10 +22,64 @@
  The iteration each instruction belongs to is marked in the comments as #i.*/
 #include <stddef.h>
 #include "x86int.h"
-#include "mmxfrag.h"

 #if defined(OC_X86_ASM)

+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.
+  _src and _dst are each evaluated once and copied into locals, and it is those
+   locals that the asm advances, so the caller's pointers are left untouched.
+  _ystride is cast to ptrdiff_t before being handed to the asm.
+  The rows are moved four at a time through mm0...mm3; those registers are
+   overwritten but only memory is declared as clobbered.
+  No emms is issued here.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    ptrdiff_t            ystride3; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm__ __volatile__( \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*ystride3=ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[src],%[ystride],4),%[src]\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
 /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
 void oc_frag_copy_mmx(unsigned char *_dst,
@ -33,6 +87,27 @@ void oc_frag_copy_mmx(unsigned char *_dst,
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
 }

+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another.
+  Each listed fragment is copied as one block with OC_FRAG_COPY_MMX, using the
+   same buffer offset, _frag_buf_offs[_fragis[i]], in both frames.
+  Nothing is copied when _nfragis is zero or negative.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+     _src_frame+frag_buf_off,_ystride);
+  }
+}
+
+
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm__ __volatile__(
--- a/media/libtheora/lib/x86/mmxfrag.h
+++ b/media/libtheora/lib/x86/mmxfrag.h
@ -1,64 +0,0 @@
-#if !defined(_x86_mmxfrag_H)
-# define _x86_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
-   between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
-  do{ \
-    const unsigned char *src; \
-    unsigned char       *dst; \
-    ptrdiff_t            ystride3; \
-    src=(_src); \
-    dst=(_dst); \
-    __asm__ __volatile__( \
-      /*src+0*ystride*/ \
-      "movq (%[src]),%%mm0\n\t" \
-      /*src+1*ystride*/ \
-      "movq (%[src],%[ystride]),%%mm1\n\t" \
-      /*ystride3=ystride*3*/ \
-      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
-      /*src+2*ystride*/ \
-      "movq (%[src],%[ystride],2),%%mm2\n\t" \
-      /*src+3*ystride*/ \
-      "movq (%[src],%[ystride3]),%%mm3\n\t" \
-      /*dst+0*ystride*/ \
-      "movq %%mm0,(%[dst])\n\t" \
-      /*dst+1*ystride*/ \
-      "movq %%mm1,(%[dst],%[ystride])\n\t" \
-      /*Pointer to next 4.*/ \
-      "lea (%[src],%[ystride],4),%[src]\n\t" \
-      /*dst+2*ystride*/ \
-      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
-      /*dst+3*ystride*/ \
-      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
-      /*Pointer to next 4.*/ \
-      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
-      /*src+0*ystride*/ \
-      "movq (%[src]),%%mm0\n\t" \
-      /*src+1*ystride*/ \
-      "movq (%[src],%[ystride]),%%mm1\n\t" \
-      /*src+2*ystride*/ \
-      "movq (%[src],%[ystride],2),%%mm2\n\t" \
-      /*src+3*ystride*/ \
-      "movq (%[src],%[ystride3]),%%mm3\n\t" \
-      /*dst+0*ystride*/ \
-      "movq %%mm0,(%[dst])\n\t" \
-      /*dst+1*ystride*/ \
-      "movq %%mm1,(%[dst],%[ystride])\n\t" \
-      /*dst+2*ystride*/ \
-      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
-      /*dst+3*ystride*/ \
-      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
-      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
-      :[ystride]"r"((ptrdiff_t)(_ystride)) \
-      :"memory" \
-    ); \
-  } \
-  while(0)
-
-# endif
-#endif
--- a/media/libtheora/lib/x86/mmxidct.c
+++ b/media/libtheora/lib/x86/mmxidct.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $

 ********************************************************************/

@ -30,89 +30,66 @@



-/*A table of constants used by the MMX routines.*/
-static const ogg_uint16_t __attribute__((aligned(8),used))
- OC_IDCT_CONSTS[(7+1)*4]={
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
-};
-
-/*Converts the expression in the argument to a string.*/
-#define OC_M2STR(_s) #_s
-
 /*38 cycles*/
-#define OC_IDCT_BEGIN \
+#define OC_IDCT_BEGIN(_y,_x) \
  "#OC_IDCT_BEGIN\n\t" \
-  "movq "OC_I(3)",%%mm2\n\t" \
-  "movq "OC_C(3)",%%mm6\n\t" \
+  "movq "OC_I(3,_x)",%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
  "movq %%mm2,%%mm4\n\t" \
-  "movq "OC_J(5)",%%mm7\n\t" \
+  "movq "OC_J(5,_x)",%%mm7\n\t" \
  "pmulhw %%mm6,%%mm4\n\t" \
-  "movq "OC_C(5)",%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
  "pmulhw %%mm7,%%mm6\n\t" \
  "movq %%mm1,%%mm5\n\t" \
  "pmulhw %%mm2,%%mm1\n\t" \
-  "movq "OC_I(1)",%%mm3\n\t" \
+  "movq "OC_I(1,_x)",%%mm3\n\t" \
  "pmulhw %%mm7,%%mm5\n\t" \
-  "movq "OC_C(1)",%%mm0\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
  "paddw %%mm2,%%mm4\n\t" \
  "paddw %%mm7,%%mm6\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
-  "movq "OC_J(7)",%%mm1\n\t" \
+  "movq "OC_J(7,_x)",%%mm1\n\t" \
  "paddw %%mm5,%%mm7\n\t" \
  "movq %%mm0,%%mm5\n\t" \
  "pmulhw %%mm3,%%mm0\n\t" \
  "paddw %%mm7,%%mm4\n\t" \
  "pmulhw %%mm1,%%mm5\n\t" \
-  "movq "OC_C(7)",%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "paddw %%mm3,%%mm0\n\t" \
  "pmulhw %%mm7,%%mm3\n\t" \
-  "movq "OC_I(2)",%%mm2\n\t" \
+  "movq "OC_I(2,_x)",%%mm2\n\t" \
  "pmulhw %%mm1,%%mm7\n\t" \
  "paddw %%mm1,%%mm5\n\t" \
  "movq %%mm2,%%mm1\n\t" \
-  "pmulhw "OC_C(2)",%%mm2\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
  "psubw %%mm5,%%mm3\n\t" \
-  "movq "OC_J(6)",%%mm5\n\t" \
+  "movq "OC_J(6,_x)",%%mm5\n\t" \
  "paddw %%mm7,%%mm0\n\t" \
  "movq %%mm5,%%mm7\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
-  "pmulhw "OC_C(2)",%%mm5\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
-  "pmulhw "OC_C(6)",%%mm1\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
  "paddw %%mm7,%%mm5\n\t" \
  "paddw %%mm6,%%mm6\n\t" \
-  "pmulhw "OC_C(6)",%%mm7\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
-  "movq %%mm4,"OC_I(1)"\n\t" \
+  "movq %%mm4,"OC_I(1,_y)"\n\t" \
  "psubw %%mm5,%%mm1\n\t" \
-  "movq "OC_C(4)",%%mm4\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "movq %%mm3,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm3\n\t" \
  "paddw %%mm2,%%mm7\n\t" \
-  "movq %%mm6,"OC_I(2)"\n\t" \
+  "movq %%mm6,"OC_I(2,_y)"\n\t" \
  "movq %%mm0,%%mm2\n\t" \
-  "movq "OC_I(0)",%%mm6\n\t" \
+  "movq "OC_I(0,_x)",%%mm6\n\t" \
  "pmulhw %%mm4,%%mm0\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
-  "movq "OC_J(4)",%%mm3\n\t" \
+  "movq "OC_J(4,_x)",%%mm3\n\t" \
  "psubw %%mm1,%%mm5\n\t" \
  "paddw %%mm0,%%mm2\n\t" \
  "psubw %%mm3,%%mm6\n\t" \
@ -126,18 +103,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  "paddw %%mm0,%%mm6\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
-  "movq "OC_I(1)",%%mm0\n\t" \
+  "movq "OC_I(1,_y)",%%mm0\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  "paddw %%mm3,%%mm4\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "#end OC_IDCT_BEGIN\n\t" \

 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT \
+#define OC_ROW_IDCT(_y,_x) \
  "#OC_ROW_IDCT\n" \
-  OC_IDCT_BEGIN \
+  OC_IDCT_BEGIN(_y,_x) \
  /*r3=D'*/ \
-  "movq "OC_I(2)",%%mm3\n\t" \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r4=E'=E-G*/ \
  "psubw %%mm7,%%mm4\n\t" \
  /*r1=H'+H'*/ \
@ -162,7 +139,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  "psubw %%mm0,%%mm7\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  /*Save R1.*/ \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r0=R0=G.+C.*/ \
  "paddw %%mm7,%%mm0\n\t" \
  "#end OC_ROW_IDCT\n\t" \
@ -195,11 +172,11 @@ static const ogg_uint16_t __attribute__((aligned(8),used))

  Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE \
+#define OC_TRANSPOSE(_y) \
  "#OC_TRANSPOSE\n\t" \
  "movq %%mm4,%%mm1\n\t" \
  "punpcklwd %%mm5,%%mm4\n\t" \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "punpckhwd %%mm5,%%mm1\n\t" \
  "movq %%mm6,%%mm0\n\t" \
  "punpcklwd %%mm7,%%mm6\n\t" \
@ -207,17 +184,17 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  "punpckldq %%mm6,%%mm4\n\t" \
  "punpckhdq %%mm6,%%mm5\n\t" \
  "movq %%mm1,%%mm6\n\t" \
-  "movq %%mm4,"OC_J(4)"\n\t" \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
  "punpckhwd %%mm7,%%mm0\n\t" \
-  "movq %%mm5,"OC_J(5)"\n\t" \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
  "punpckhdq %%mm0,%%mm6\n\t" \
-  "movq "OC_I(0)",%%mm4\n\t" \
+  "movq "OC_I(0,_y)",%%mm4\n\t" \
  "punpckldq %%mm0,%%mm1\n\t" \
-  "movq "OC_I(1)",%%mm5\n\t" \
+  "movq "OC_I(1,_y)",%%mm5\n\t" \
  "movq %%mm4,%%mm0\n\t" \
-  "movq %%mm6,"OC_J(7)"\n\t" \
+  "movq %%mm6,"OC_J(7,_y)"\n\t" \
  "punpcklwd %%mm5,%%mm0\n\t" \
-  "movq %%mm1,"OC_J(6)"\n\t" \
+  "movq %%mm1,"OC_J(6,_y)"\n\t" \
  "punpckhwd %%mm5,%%mm4\n\t" \
  "movq %%mm2,%%mm5\n\t" \
  "punpcklwd %%mm3,%%mm2\n\t" \
@ -225,20 +202,20 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  "punpckldq %%mm2,%%mm0\n\t" \
  "punpckhdq %%mm2,%%mm1\n\t" \
  "movq %%mm4,%%mm2\n\t" \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "punpckhwd %%mm3,%%mm5\n\t" \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  "punpckhdq %%mm5,%%mm4\n\t" \
  "punpckldq %%mm5,%%mm2\n\t" \
-  "movq %%mm4,"OC_I(3)"\n\t" \
-  "movq %%mm2,"OC_I(2)"\n\t" \
+  "movq %%mm4,"OC_I(3,_y)"\n\t" \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
  "#end OC_TRANSPOSE\n\t" \

 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT \
+#define OC_COLUMN_IDCT(_y) \
  "#OC_COLUMN_IDCT\n" \
-  OC_IDCT_BEGIN \
-  "paddw "OC_8",%%mm2\n\t" \
+  OC_IDCT_BEGIN(_y,_y) \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r1=R1=A''+H'*/ \
@ -250,18 +227,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  /*r1=NR1*/ \
  "psraw $4,%%mm1\n\t" \
  /*r3=D'*/ \
-  "movq "OC_I(2)",%%mm3\n\t" \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r7=G+G*/ \
  "paddw %%mm7,%%mm7\n\t" \
  /*Store NR2 at I(2).*/ \
-  "movq %%mm2,"OC_I(2)"\n\t" \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
  /*r7=G'=E+G*/ \
  "paddw %%mm4,%%mm7\n\t" \
  /*Store NR1 at I(1).*/ \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
-  "paddw "OC_8",%%mm4\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
  /*r3=D'+D'*/ \
  "paddw %%mm3,%%mm3\n\t" \
  /*r3=R3=E'+D'*/ \
@ -272,7 +249,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  "psubw %%mm5,%%mm6\n\t" \
  /*r3=NR3*/ \
  "psraw $4,%%mm3\n\t" \
-  "paddw "OC_8",%%mm6\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
  /*r5=B''+B''*/ \
  "paddw %%mm5,%%mm5\n\t" \
  /*r5=R5=F'+B''*/ \
@ -280,14 +257,14 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  /*r6=NR6*/ \
  "psraw $4,%%mm6\n\t" \
  /*Store NR4 at J(4).*/ \
-  "movq %%mm4,"OC_J(4)"\n\t" \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
  /*r5=NR5*/ \
  "psraw $4,%%mm5\n\t" \
  /*Store NR3 at I(3).*/ \
-  "movq %%mm3,"OC_I(3)"\n\t" \
+  "movq %%mm3,"OC_I(3,_y)"\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
-  "paddw "OC_8",%%mm7\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
  /*r0=C'+C'*/ \
  "paddw %%mm0,%%mm0\n\t" \
  /*r0=R0=G'+C'*/ \
@ -295,113 +272,123 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
  /*r7=NR7*/ \
  "psraw $4,%%mm7\n\t" \
  /*Store NR6 at J(6).*/ \
-  "movq %%mm6,"OC_J(6)"\n\t" \
+  "movq %%mm6,"OC_J(6,_y)"\n\t" \
  /*r0=NR0*/ \
  "psraw $4,%%mm0\n\t" \
  /*Store NR5 at J(5).*/ \
-  "movq %%mm5,"OC_J(5)"\n\t" \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
  /*Store NR7 at J(7).*/ \
-  "movq %%mm7,"OC_J(7)"\n\t" \
+  "movq %%mm7,"OC_J(7,_y)"\n\t" \
  /*Store NR0 at I(0).*/ \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "#end OC_COLUMN_IDCT\n\t" \

-#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
-#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
-#define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
-
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  /*This routine accepts an 8x8 matrix, but in partially transposed form.
    Every 4x4 block is transposed.*/
  __asm__ __volatile__(
-#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"(%[y])"
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+8,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+64)"(%[y])"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"(%[y])"
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+64,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+72,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
 #undef  OC_I
 #undef  OC_J
-    :
-    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
  );
+  if(_x!=_y){
+    int i;
+    __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+    for(i=0;i<4;i++){
+      __asm__ __volatile__(
+        "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+        :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+      );
+    }
+  }
 }

 /*25 cycles.*/
-#define OC_IDCT_BEGIN_10 \
+#define OC_IDCT_BEGIN_10(_y,_x) \
 "#OC_IDCT_BEGIN_10\n\t" \
- "movq "OC_I(3)",%%mm2\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
 "nop\n\t" \
- "movq "OC_C(3)",%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
 "movq %%mm2,%%mm4\n\t" \
- "movq "OC_C(5)",%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
 "pmulhw %%mm6,%%mm4\n\t" \
- "movq "OC_I(1)",%%mm3\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
 "pmulhw %%mm2,%%mm1\n\t" \
- "movq "OC_C(1)",%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
 "paddw %%mm2,%%mm4\n\t" \
 "pxor %%mm6,%%mm6\n\t" \
 "paddw %%mm1,%%mm2\n\t" \
- "movq "OC_I(2)",%%mm5\n\t" \
+ "movq "OC_I(2,_x)",%%mm5\n\t" \
 "pmulhw %%mm3,%%mm0\n\t" \
 "movq %%mm5,%%mm1\n\t" \
 "paddw %%mm3,%%mm0\n\t" \
- "pmulhw "OC_C(7)",%%mm3\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
 "psubw %%mm2,%%mm6\n\t" \
- "pmulhw "OC_C(2)",%%mm5\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
 "psubw %%mm4,%%mm0\n\t" \
- "movq "OC_I(2)",%%mm7\n\t" \
+ "movq "OC_I(2,_x)",%%mm7\n\t" \
 "paddw %%mm4,%%mm4\n\t" \
 "paddw %%mm5,%%mm7\n\t" \
 "paddw %%mm0,%%mm4\n\t" \
- "pmulhw "OC_C(6)",%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
 "psubw %%mm6,%%mm3\n\t" \
- "movq %%mm4,"OC_I(1)"\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
- "movq "OC_C(4)",%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
 "paddw %%mm3,%%mm6\n\t" \
 "movq %%mm3,%%mm5\n\t" \
 "pmulhw %%mm4,%%mm3\n\t" \
- "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
 "movq %%mm0,%%mm2\n\t" \
- "movq "OC_I(0)",%%mm6\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
 "pmulhw %%mm4,%%mm0\n\t" \
 "paddw %%mm3,%%mm5\n\t" \
 "paddw %%mm0,%%mm2\n\t" \
 "psubw %%mm1,%%mm5\n\t" \
 "pmulhw %%mm4,%%mm6\n\t" \
- "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw "OC_I(0,_x)",%%mm6\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "movq %%mm6,%%mm4\n\t" \
 "paddw %%mm5,%%mm1\n\t" \
 "psubw %%mm2,%%mm6\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
- "movq "OC_I(1)",%%mm0\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
 "paddw %%mm6,%%mm2\n\t" \
 "psubw %%mm1,%%mm2\n\t" \
 "nop\n\t" \
 "#end OC_IDCT_BEGIN_10\n\t" \

 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 \
+#define OC_ROW_IDCT_10(_y,_x) \
 "#OC_ROW_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
+ OC_IDCT_BEGIN_10(_y,_x) \
 /*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
 /*r4=E'=E-G*/ \
 "psubw %%mm7,%%mm4\n\t" \
 /*r1=H'+H'*/ \
@ -426,16 +413,16 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
 "psubw %%mm0,%%mm7\n\t" \
 "paddw %%mm0,%%mm0\n\t" \
 /*Save R1.*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
 /*r0=R0=G'+C'*/ \
 "paddw %%mm7,%%mm0\n\t" \
 "#end OC_ROW_IDCT_10\n\t" \

 /*25+19=44 cycles.*/
-#define OC_COLUMN_IDCT_10 \
+#define OC_COLUMN_IDCT_10(_y) \
 "#OC_COLUMN_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
- "paddw "OC_8",%%mm2\n\t" \
+ OC_IDCT_BEGIN_10(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
 /*r1=H'+H'*/ \
 "paddw %%mm1,%%mm1\n\t" \
 /*r1=R1=A''+H'*/ \
@ -447,18 +434,18 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
 /*r1=NR1*/ \
 "psraw $4,%%mm1\n\t" \
 /*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
 /*r7=G+G*/ \
 "paddw %%mm7,%%mm7\n\t" \
 /*Store NR2 at I(2).*/ \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
 /*r7=G'=E+G*/ \
 "paddw %%mm4,%%mm7\n\t" \
 /*Store NR1 at I(1).*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
 /*r4=R4=E'-D'*/ \
 "psubw %%mm3,%%mm4\n\t" \
- "paddw "OC_8",%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
 /*r3=D'+D'*/ \
 "paddw %%mm3,%%mm3\n\t" \
 /*r3=R3=E'+D'*/ \
@ -469,7 +456,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
 "psubw %%mm5,%%mm6\n\t" \
 /*r3=NR3*/ \
 "psraw $4,%%mm3\n\t" \
- "paddw "OC_8",%%mm6\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
 /*r5=B''+B''*/ \
 "paddw %%mm5,%%mm5\n\t" \
 /*r5=R5=F'+B''*/ \
@ -477,14 +464,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
 /*r6=NR6*/ \
 "psraw $4,%%mm6\n\t" \
 /*Store NR4 at J(4).*/ \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
 /*r5=NR5*/ \
 "psraw $4,%%mm5\n\t" \
 /*Store NR3 at I(3).*/ \
- "movq %%mm3,"OC_I(3)"\n\t" \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
 /*r7=R7=G'-C'*/ \
 "psubw %%mm0,%%mm7\n\t" \
- "paddw "OC_8",%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
 /*r0=C'+C'*/ \
 "paddw %%mm0,%%mm0\n\t" \
 /*r0=R0=G'+C'*/ \
@ -492,46 +479,57 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
 /*r7=NR7*/ \
 "psraw $4,%%mm7\n\t" \
 /*Store NR6 at J(6).*/ \
- "movq %%mm6,"OC_J(6)"\n\t" \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
 /*r0=NR0*/ \
 "psraw $4,%%mm0\n\t" \
 /*Store NR5 at J(5).*/ \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
 /*Store NR7 at J(7).*/ \
- "movq %%mm7,"OC_J(7)"\n\t" \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
 /*Store NR0 at I(0).*/ \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
 "#end OC_COLUMN_IDCT_10\n\t" \

-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  __asm__ __volatile__(
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
    /*Done with dequant, descramble, and partial transpose.
      Now do the iDCT itself.*/
-    OC_ROW_IDCT_10
-    OC_TRANSPOSE
+    OC_ROW_IDCT_10(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
 #undef  OC_I
 #undef  OC_J
-    :
-    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
  );
+  if(_x!=_y){
+    __asm__ __volatile__(
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+      :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
+    );
+  }
 }

 /*Performs an inverse 8x8 Type-II DCT transform.
  The input is assumed to be scaled by a factor of 4 relative to orthonormal
   version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
  /*_last_zzi is subtly different from an actual count of the number of
     coefficients we decoded for this block.
    It contains the value of zzi BEFORE the final token in the block was
@ -557,8 +555,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
     gets.
    Needless to say we inherited this approach from VP3.*/
  /*Then perform the iDCT.*/
-  if(_last_zzi<10)oc_idct8x8_10(_y);
-  else oc_idct8x8_slow(_y);
+  if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
+  else oc_idct8x8_slow_mmx(_y,_x);
 }

 #endif
--- a/media/libtheora/lib/x86/mmxloop.h
+++ b/media/libtheora/lib/x86/mmxloop.h
@ -9,88 +9,191 @@
  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
 #define OC_LOOP_FILTER8_MMX \
- "#OC_LOOP_FILTER8_MMX\n\t" \
- /*mm7=0*/ \
- "pxor %%mm7,%%mm7\n\t" \
- /*mm6:mm0={a0,...,a7}*/ \
- "movq %%mm0,%%mm6\n\t" \
- "punpcklbw %%mm7,%%mm0\n\t" \
- "punpckhbw %%mm7,%%mm6\n\t" \
- /*mm3:mm5={d0,...,d7}*/ \
- "movq %%mm3,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm3\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm6:mm0={a0-d0,...,a7-d7}*/ \
- "psubw %%mm3,%%mm0\n\t" \
- "psubw %%mm5,%%mm6\n\t" \
- /*mm3:mm1={b0,...,b7}*/ \
- "movq %%mm1,%%mm3\n\t" \
- "punpcklbw %%mm7,%%mm1\n\t" \
- "movq %%mm2,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm3\n\t" \
- /*mm5:mm4={c0,...,c7}*/ \
- "movq %%mm2,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm7={3}x4 \
-   mm5:mm4={c0-b0,...,c7-b7}*/ \
- "pcmpeqw %%mm7,%%mm7\n\t" \
- "psubw %%mm1,%%mm4\n\t" \
- "psrlw $14,%%mm7\n\t" \
- "psubw %%mm3,%%mm5\n\t" \
- /*Scale by 3.*/ \
- "pmullw %%mm7,%%mm4\n\t" \
- "pmullw %%mm7,%%mm5\n\t" \
- /*mm7={4}x4 \
-   mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
- "psrlw $1,%%mm7\n\t" \
- "paddw %%mm0,%%mm4\n\t" \
- "psllw $2,%%mm7\n\t" \
- "movq (%[ll]),%%mm0\n\t" \
- "paddw %%mm6,%%mm5\n\t" \
- /*R_i has the range [-127,128], so we compute -R_i instead. \
-   mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
- "psubw %%mm7,%%mm4\n\t" \
- "psubw %%mm7,%%mm5\n\t" \
- "psraw $3,%%mm4\n\t" \
- "psraw $3,%%mm5\n\t" \
- "pcmpeqb %%mm7,%%mm7\n\t" \
- "packsswb %%mm5,%%mm4\n\t" \
- "pxor %%mm6,%%mm6\n\t" \
- "pxor %%mm7,%%mm4\n\t" \
- "packuswb %%mm3,%%mm1\n\t" \
- /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
- /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
-    we have to split things by sign (the other option is to work in 16 bits, \
-    but working in 8 bits gives much better parallelism). \
-   We compute abs(R_i), but save a mask of which terms were negative in mm6. \
-   Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
-   Finally, we split mm4 into positive and negative pieces using the mask in \
-    mm6, and add and subtract them as appropriate.*/ \
- /*mm4=abs(-R_i)*/ \
- /*mm7=255-2*L*/ \
- "pcmpgtb %%mm4,%%mm6\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "pxor %%mm6,%%mm4\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "psubb %%mm6,%%mm4\n\t" \
- /*mm7=255-max(2*L-abs(R_i),0)*/ \
- "paddusb %%mm4,%%mm7\n\t" \
- /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
- "paddusb %%mm7,%%mm4\n\t" \
- "psubusb %%mm7,%%mm4\n\t" \
- /*Now split mm4 by the original sign of -R_i.*/ \
- "movq %%mm4,%%mm5\n\t" \
- "pand %%mm6,%%mm4\n\t" \
- "pandn %%mm5,%%mm6\n\t" \
- /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
- /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
- "paddusb %%mm4,%%mm1\n\t" \
- "psubusb %%mm4,%%mm2\n\t" \
- "psubusb %%mm6,%%mm1\n\t" \
- "paddusb %%mm6,%%mm2\n\t" \
+  "#OC_LOOP_FILTER8_MMX\n\t" \
+  /*mm7=0*/ \
+  "pxor %%mm7,%%mm7\n\t" \
+  /*mm6:mm0={a0,...,a7}*/ \
+  "movq %%mm0,%%mm6\n\t" \
+  "punpcklbw %%mm7,%%mm0\n\t" \
+  "punpckhbw %%mm7,%%mm6\n\t" \
+  /*mm3:mm5={d0,...,d7}*/ \
+  "movq %%mm3,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm3\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  "psubw %%mm3,%%mm0\n\t" \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*mm3:mm1={b0,...,b7}*/ \
+  "movq %%mm1,%%mm3\n\t" \
+  "punpcklbw %%mm7,%%mm1\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm3\n\t" \
+  /*mm5:mm4={c0,...,c7}*/ \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  "pcmpeqw %%mm7,%%mm7\n\t" \
+  "psubw %%mm1,%%mm4\n\t" \
+  "psrlw $14,%%mm7\n\t" \
+  "psubw %%mm3,%%mm5\n\t" \
+  /*Scale by 3.*/ \
+  "pmullw %%mm7,%%mm4\n\t" \
+  "pmullw %%mm7,%%mm5\n\t" \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  "psrlw $1,%%mm7\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psllw $2,%%mm7\n\t" \
+  "movq (%[ll]),%%mm0\n\t" \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm7,%%mm5\n\t" \
+  "psraw $3,%%mm4\n\t" \
+  "psraw $3,%%mm5\n\t" \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "packsswb %%mm5,%%mm4\n\t" \
+  "pxor %%mm6,%%mm6\n\t" \
+  "pxor %%mm7,%%mm4\n\t" \
+  "packuswb %%mm3,%%mm1\n\t" \
+  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  "pcmpgtb %%mm4,%%mm6\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "pxor %%mm6,%%mm4\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "psubb %%mm6,%%mm4\n\t" \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  "paddusb %%mm4,%%mm7\n\t" \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  "paddusb %%mm7,%%mm4\n\t" \
+  "psubusb %%mm7,%%mm4\n\t" \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  "movq %%mm4,%%mm5\n\t" \
+  "pand %%mm6,%%mm4\n\t" \
+  "pandn %%mm5,%%mm6\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm4,%%mm1\n\t" \
+  "psubusb %%mm4,%%mm2\n\t" \
+  "psubusb %%mm6,%%mm1\n\t" \
+  "paddusb %%mm6,%%mm2\n\t" \

-#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...,d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
+  All other MMX registers are clobbered.
+  The 8 bytes at [ll] are loaded as-is and used as 255-2*L, so they must
+   already hold that value; OC_LOOP_FILTER8_MMX instead loads L from [ll] and
+   subtracts it from 255 twice itself.
+  This variant uses pavgb, so it needs more than the baseline MMX instruction
+   set (hence the MMXEXT name).*/
+#define OC_LOOP_FILTER8_MMXEXT \
+  "#OC_LOOP_FILTER8_MMXEXT\n\t" \
+  /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
+     -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
+  /*This first part is based on the transformation \
+      f = -(3*(c-b)+a-d+4>>3) \
+        = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
+        = -(3*(c+~b)+(a+~d)-1016>>3) \
+        = 127-(3*(c+~b)+(a+~d)>>3) \
+        = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
+    Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
+     fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
+    Using this, the last expression above can be computed in 8 bits of working \
+     precision via: \
+      u = ~pavgb(~b,c); \
+      v = pavgb(b,~c); \
+      This mask is 0 or 0xFF, and controls whether t is biased up or down: \
+      m = u-v; \
+      t = m^pavgb(m^~a,m^d); \
+      f = 128+pavgb(pavgb(t,u),v); \
+    This required some careful analysis to ensure that carries are propagated \
+     correctly in all cases, but has been checked exhaustively.*/ \
+  /*input (a, b, c, d, ., ., ., .)*/ \
+  /*ff=0xFF; \
+    u=b; \
+    v=c; \
+    ll=255-2*L;*/ \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "movq %%mm1,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "movq (%[ll]),%%mm6\n\t" \
+  /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u^=ff; \
+    v^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm5\n\t" \
+  /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u=pavgb(u,c); \
+    v=pavgb(v,b);*/ \
+  "pavgb %%mm2,%%mm4\n\t" \
+  "pavgb %%mm1,%%mm5\n\t" \
+  /*u^=ff; \
+    a^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm0\n\t" \
+  /*m=u-v;*/ \
+  "psubb %%mm5,%%mm4\n\t" \
+  /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
+  /*a^=m; \
+    d^=m;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "pxor %%mm4,%%mm3\n\t" \
+  /*t=pavgb(a,d);*/ \
+  "pavgb %%mm3,%%mm0\n\t" \
+  "psllw $7,%%mm7\n\t" \
+  /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
+  /*t^=m; \
+    u=m+v;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "paddb %%mm5,%%mm4\n\t" \
+  /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
+  /*f=pavgb(f,u); \
+    of=128;*/ \
+  "pavgb %%mm4,%%mm0\n\t" \
+  "packsswb %%mm7,%%mm7\n\t" \
+  /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
+  /*f=pavgb(f,v);*/ \
+  "pavgb %%mm5,%%mm0\n\t" \
+  "movq %%mm7,%%mm3\n\t" \
+  "movq %%mm6,%%mm4\n\t" \
+  /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
+  /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but staying in 8 bits gives much better parallelism).*/ \
+  /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
+    This is the same number of instructions as computing a mask and splitting \
+     after the lflim computation, but has shorter dependency chains.*/ \
+  /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
+    mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
+  "psubusb %%mm0,%%mm3\n\t" \
+  "psubusb %%mm7,%%mm0\n\t" \
+  /*mm6=255-max(2*L-abs(R_i<0),0) \
+    mm4=255-max(2*L-abs(R_i>0),0)*/ \
+  "paddusb %%mm3,%%mm4\n\t" \
+  "paddusb %%mm0,%%mm6\n\t" \
+  /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
+    mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
+  "paddusb %%mm4,%%mm3\n\t" \
+  "paddusb %%mm6,%%mm0\n\t" \
+  "psubusb %%mm4,%%mm3\n\t" \
+  "psubusb %%mm6,%%mm0\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm3,%%mm1\n\t" \
+  "psubusb %%mm3,%%mm2\n\t" \
+  "psubusb %%mm0,%%mm1\n\t" \
+  "paddusb %%mm0,%%mm2\n\t" \
+
+#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
  do{ \
    ptrdiff_t ystride3__; \
    __asm__ __volatile__( \
@ -104,7 +207,7 @@
      "movq (%[pix],%[ystride]),%%mm1\n\t" \
      /*mm2={c0,...,c7}*/ \
      "movq (%[pix],%[ystride],2),%%mm2\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
      /*Write it back out.*/ \
      "movq %%mm1,(%[pix],%[ystride])\n\t" \
      "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
@ -116,7 +219,7 @@
  } \
  while(0)

-#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
  do{ \
    unsigned char *pix__; \
    ptrdiff_t      ystride3__; \
@ -174,7 +277,7 @@
      "punpckldq %%mm5,%%mm2\n\t" \
      /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
      "punpckhdq %%mm5,%%mm3\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
      /*mm2={b0+R_0'',...,b7+R_7''}*/ \
      "movq %%mm1,%%mm0\n\t" \
      /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
--- a/media/libtheora/lib/x86/mmxstate.c
+++ b/media/libtheora/lib/x86/mmxstate.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $

 ********************************************************************/

@ -19,23 +19,23 @@
  Originally written by Rudolf Marek.*/
 #include <string.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 #include "mmxloop.h"

 #if defined(OC_X86_ASM)

 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
-  int            mb_mode;
+  int            refi;
  /*Apply the inverse transform.*/
  /*Special case only having a DC component.*/
  if(_last_zzi<2){
    /*Note that this value must be unsigned, to keep the __asm__ block from
       sign-extending it when it puts it in a register.*/
    ogg_uint16_t p;
+    int          i;
    /*We round this dequant product (and not any of the others) because there's
       no iDCT rounding.*/
    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
@ -47,81 +47,48 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
      "punpcklwd %%mm0,%%mm0\n\t"
      /*mm0=AAAA AAAA AAAA AAAA*/
      "punpckldq %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
      :
-      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
-      :"memory"
+      :[p]"r"((unsigned)p)
    );
+    for(i=0;i<4;i++){
+      __asm__ __volatile__(
+        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
+        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
+      );
+    }
  }
  else{
    /*Dequantize the DC coefficient.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
  else{
    const unsigned char *ref;
    int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       _dct_coeffs);
+       _dct_coeffs+64);
    }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
  }
 }

 /*We copy these entire function to inline the actual MMX routines so that we
   use only a single indirect call.*/

-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
-  }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+  memset(_bv,_flimit,8);/*Only the first 8 bytes of _bv are written: each is
+     set to _flimit (converted to unsigned char by memset).*/
 }

 /*Apply the loop filter to a given set of fragment rows in the given plane.
@ -133,7 +100,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
  _fragy0:    The Y coordinate of the first fragment row to filter.
  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
  OC_ALIGN8(unsigned char   ll[8]);
  const oc_fragment_plane *fplane;
  const oc_fragment       *frags;
@ -170,13 +137,84 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
      if(frags[fragi].coded){
        unsigned char *ref;
        ref=ref_frame_data+frag_buf_offs[fragi];
-        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
-        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
-          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
        }
        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
-          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
+  memset(_bv,~(_flimit<<1),8);/*Only the first 8 bytes of _bv are written:
+     each is set to ~(2*_flimit) converted to unsigned char (i.e.,
+     255-2*_flimit when 2*_flimit fits in a byte).
+    oc_state_loop_filter_frag_rows_mmxext() passes _bv to the filter macros
+     as their _ll argument.*/
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
        }
      }
      fragi++;
--- a/media/libtheora/lib/x86/sse2idct.c
+++ b/media/libtheora/lib/x86/sse2idct.c
@ -0,0 +1,460 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*SSE2 acceleration of Theora's iDCT.*/
+#include "x86int.h"
+#include "sse2trans.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*A table of constants used by the SSE2 iDCT routines in this file (the MMX
+   first pass of the 10-coefficient version reads the low 64 bits of each
+   row).
+  Each 16-byte row holds eight copies of one value, so a row can be used
+   directly as a packed-word operand; the table is 16-byte aligned because
+   the rows are loaded with movdqa.
+  Row 0 (byte offset 0x00) is the rounding bias 8 that is added before the
+   final right shift by 4, and rows 1 through 7 (byte offsets 0x10 through
+   0x70) hold OC_C1S7 through OC_C7S1.*/
+const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
+        8,      8,      8,      8,      8,      8,      8,      8,
+  OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
+  OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
+  OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
+  OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
+  OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
+  OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
+  OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
+};
+
+
+/*Performs the first three stages of the iDCT.
+  xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
+   (accessed in that order).
+  The remaining rows (0, 1, 4, and 7) must be in _x at their corresponding
+   locations (byte offsets 0x00, 0x10, 0x40, and 0x70).
+  _x names an operand of the enclosing asm statement, which must also provide
+   operands named c (the constant table) and buf (two 16-byte scratch rows,
+   used to spill t[2] and t[3] between stage 1 and stage 3).
+  On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.*/
+#define OC_IDCT_8x8_ABC(_x) \
+  "#OC_IDCT_8x8_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
+  "movdqa %%xmm1,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm1\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm6,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  "paddw %%xmm4,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
+  "movdqa %%xmm4,%%xmm2\n\t" \
+  "movdqa %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm4\n\t" \
+  "pmulhw %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm6\n\t" \
+  "pmulhw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm6,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  "psubw %%xmm4,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
+  "movdqa %%xmm3,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm5,%%xmm3\n\t" \
+  "pmulhw %%xmm5,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm7\n\t" \
+  "psubw %%xmm4,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
+  "paddw %%xmm7,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm7\n\t" \
+  "psubw %%xmm6,%%xmm7\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm3\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "movdqa %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm5\n\t" \
+  "paddw %%xmm7,%%xmm5\n\t" \
+  "movdqa %%xmm0,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+/*Performs the last stage of the iDCT, leaving the result in registers.
+  Only the four butterflies are computed: nothing is rounded, shifted, or
+   written to memory, and no asm operands are referenced.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.
+  On output, xmm0 through xmm7 contain the corresponding rows.*/
+#define OC_IDCT_8x8_D \
+  "#OC_IDCT_8x8_D\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+
+/*Performs the last stage of the iDCT, then rounds, scales, and stores the
+   result.
+  Every output row has the bias in row 0 of c added to it and is then
+   arithmetically shifted right by 4 before being written to the corresponding
+   16-byte row of y.
+  The enclosing asm statement must provide operands named c and y; the row of
+   y at offset 0x40 is also used as scratch to free xmm4 for the bias, so it
+   is written twice.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.
+  On output, xmm0 through xmm7 contain the corresponding (rounded and shifted)
+   rows.*/
+#define OC_IDCT_8x8_D_STORE \
+  "#OC_IDCT_8x8_D_STORE\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm4,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm5\n\t" \
+  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "psraw $4,%%xmm0\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
+  "psraw $4,%%xmm1\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
+  "psraw $4,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
+  "psraw $4,%%xmm3\n\t" \
+  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
+  "psraw $4,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "psraw $4,%%xmm5\n\t" \
+  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
+  "psraw $4,%%xmm6\n\t" \
+  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
+  "psraw $4,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
+
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  /*Full 8x8 iDCT: one pass over the rows of _x, a transpose, then a second
+     pass whose last stage rounds, shifts, and stores the result to _y.
+    This routine accepts an 8x8 matrix pre-transposed.
+    buf is scratch space for the two rows that the first three stages of each
+     pass spill.
+    NOTE(review): the c operand below is declared with 128 elements, but
+     OC_IDCT_CONSTS has 64 and the asm only reads byte offsets 0x00...0x7F of
+     it; confirm whether the size argument should be 64.*/
+  __asm__ __volatile__(
+    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
+    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
+    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
+    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
+    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
+    OC_IDCT_8x8_ABC(x)
+    OC_IDCT_8x8_D
+    OC_TRANSPOSE_8x8
+    /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT: they are
+       spilled to _y, from where the second pass reloads them, while rows 2, 3,
+       5, and 6 stay in their registers.*/
+    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
+    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
+    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
+    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+    OC_IDCT_8x8_ABC(y)
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
+  );
+  if(_x!=_y){
+    int i;
+    __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+    /*Clear input data for next block (decoder only).
+      Each iteration zeroes 32 coefficients (64 bytes) of _x from xmm0.
+      NOTE(review): this relies on xmm0 still being zero from the separate asm
+       statement above, which nothing in the operand lists tells the compiler;
+       confirm that is safe.*/
+    for(i=0;i<2;i++){
+      __asm__ __volatile__(
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+        :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+      );
+    }
+  }
+}
+
+/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
+   need to work with four columns at a time.
+  Doing this in MMX is faster on processors with a 64-bit data path.
+  On input, mm0 through mm3 must contain X0 through X3; X4 through X7 are
+   taken to be zero and are never loaded.
+  The enclosing asm statement must provide operands named c (the constant
+   table, of which the low 64 bits of each row are read) and buf (scratch
+   space; 8 bytes are used at each of the offsets 0x00 and 0x10).
+  All four stages are performed; on output, mm0 through mm7 contain the
+   corresponding rows.*/
+#define OC_IDCT_8x8_10_MMX \
+  "#OC_IDCT_8x8_10_MMX\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
+  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
+  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
+  "pmulhw %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "paddw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
+  "pmulhw %%mm1,%%mm3\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+  "movq %%mm3,%%mm6\n\t" \
+  "paddw %%mm1,%%mm7\n\t" \
+  /*0-1 butterfly. \
+    mm4=C4, mm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
+    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
+  "psubw %%mm5,%%mm3\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "pmulhw %%mm0,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "movq %%mm7,%%mm0\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "paddw %%mm2,%%mm0\n\t" \
+  "psubw %%mm2,%%mm7\n\t" \
+  "movq %%mm1,%%mm2\n\t" \
+  "pmulhw %%mm6,%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm2\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
+  "paddw %%mm7,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
+    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
+    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "paddw %%mm4,%%mm7\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm6,%%mm5\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
+    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
+    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
+    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "psubw %%mm1,%%mm6\n\t" \
+  "psubw %%mm2,%%mm5\n\t" \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "paddw %%mm5,%%mm2\n\t" \
+  "paddw %%mm4,%%mm3\n\t" \
+
+#define OC_IDCT_8x8_10_ABC \
+  "#OC_IDCT_8x8_10_ABC\n\t" \
+  /*The first three stages of the 10-coefficient iDCT on full xmm rows: \
+     xmm0 through xmm3 must contain X0 through X3, and X4 through X7 are \
+     taken to be zero and are never loaded. \
+    The enclosing asm statement must provide operands named c (the constant \
+     table) and buf (two 16-byte scratch rows). \
+    Stage 3 is identical to that of OC_IDCT_8x8_ABC, so the same last-stage \
+     macros can follow. \
+    Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
+  "paddw %%xmm6,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
+  "pmulhw %%xmm3,%%xmm5\n\t" \
+  "pmulhw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
+  "paddw %%xmm3,%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
+  "pmulhw %%xmm1,%%xmm3\n\t" \
+  "pmulhw %%xmm1,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=C4, xmm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
+  "psubw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm1\n\t" \
+  "pmulhw %%xmm0,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm4\n\t" \
+  "movdqa %%xmm7,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  /*Reduced iDCT that reads only the first four coefficients (8 bytes) of
+     rows 0 through 3 of _x and treats everything else as zero.
+    The first pass runs on four columns in MMX registers; its output is handed
+     to OC_TRANSPOSE_8x4_MMX2SSE (defined elsewhere), after which the second
+     pass runs on xmm registers and rounds, shifts, and stores to _y.
+    This routine accepts an 8x8 matrix pre-transposed.
+    NOTE(review): the MMX state is not cleared here (there is no emms);
+     presumably the caller takes care of that; confirm.*/
+  __asm__ __volatile__(
+    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
+    OC_IDCT_8x8_10_MMX
+    OC_TRANSPOSE_8x4_MMX2SSE
+    OC_IDCT_8x8_10_ABC
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
+  );
+  if(_x!=_y){
+    /*Clear input data for next block (decoder only).
+      Only the first 8 bytes of rows 0 through 3 are zeroed, which are the
+       only bytes of _x this routine reads; the 28-element operand covers
+       exactly up to the last byte written (offset 0x37).*/
+    __asm__ __volatile__(
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+      :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+    );
+  }
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.
+  _y:        The buffer to store the result in.
+  _x:        The input coefficients, as an 8x8 matrix pre-transposed.
+             If _x!=_y, the coefficients that were read are zeroed afterwards.
+  _last_zzi: The value of zzi before the final token of the block was decoded
+              (see below); values of 10 or less select the reduced transform
+              that only reads the first four coefficients of rows 0 through
+              3.*/
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Then perform the iDCT, using the cheaper 10-coefficient routine whenever
+     _last_zzi allows it.*/
+  if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
+  else oc_idct8x8_slow_sse2(_y,_x);
+}
+
+#endif
--- a/media/libtheora/lib/x86/sse2trans.h
+++ b/media/libtheora/lib/x86/sse2trans.h
@ -0,0 +1,242 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_sse2trans_H)
+# define _x86_sse2trans_H (1)
+# include "x86int.h"
+
+# if defined(OC_X86_64_ASM)
+/*On x86-64 we can transpose in-place without spilling registers.
+  By clever choices of the order to apply the butterflies and the order of
+   their outputs, we can take the rows in order and output the columns in order
+   without any extra operations and using just one temporary register.
+  On entry, xmm0...xmm7 hold the eight rows a...h of 16-bit values (the
+   comments below list the lanes from most to least significant).
+  On exit, xmmN holds hN gN fN eN dN cN bN aN for N=0...7, and xmm8 has been
+   overwritten as the temporary.
+  The transpose is done in three interleaving passes: 16-bit (punpck?wd),
+   32-bit (punpck?dq), and 64-bit (punpck?qdq).*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm8 is free.*/ \
+
+# else
+/*Otherwise, we need to spill some values to %[buf] temporarily.
+  Again, the butterflies are carefully arranged to get the columns to come out
+   in order, minimizing register spills and maximizing the delay between a load
+   and when the value loaded is actually used.
+  On entry, xmm0...xmm7 hold the eight rows a...h of 16-bit values (the
+   comments below list the lanes from most to least significant).
+  On exit, xmmN holds hN gN fN eN dN cN bN aN for N=0...7.
+  The enclosing asm statement must supply a memory operand named buf; two
+   16-byte slots (offsets 0x00 and 0x10) are overwritten, and since they are
+   accessed with movdqa they must be 16-byte aligned.*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm2,%%xmm0\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm0\n\t" \
+ /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
+ /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm6,%%xmm2\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm2\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm7\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm3,%%xmm5\n\t" \
+ /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm3\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm2,%%xmm7\n\t" \
+ /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
+ /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm3,%%xmm1\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm0,%%xmm3\n\t" \
+ /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm0,%%xmm1\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm4,%%xmm0\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm0\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm5,%%xmm6\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm6\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm0,%%xmm1\n\t" \
+ /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+ /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm7,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm7,%%xmm4\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm0,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm0,%%xmm7\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
+
+# endif
+
+/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
+   four SSE registers.
+  No need to be clever here; we have plenty of room.
+  On entry, mm0...mm7 hold four 16-bit values from each of the rows a...h (the
+   comments below list the lanes from most to least significant).
+  On exit, xmmN holds hN gN fN eN dN cN bN aN for N=0...3; xmm4...xmm7 are
+   overwritten with intermediate values.*/
+#  define OC_TRANSPOSE_8x4_MMX2SSE \
+ "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
+ "movq2dq %%mm0,%%xmm0\n\t" \
+ "movq2dq %%mm1,%%xmm1\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ "movq2dq %%mm2,%%xmm3\n\t" \
+ "movq2dq %%mm3,%%xmm2\n\t" \
+ /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm2,%%xmm3\n\t" \
+ "movq2dq %%mm4,%%xmm4\n\t" \
+ "movq2dq %%mm5,%%xmm5\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ "movq2dq %%mm6,%%xmm7\n\t" \
+ "movq2dq %%mm7,%%xmm6\n\t" \
+ /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm3,%%xmm0\n\t" \
+ /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm3,%%xmm2\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm7,%%xmm4\n\t" \
+ /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm7,%%xmm5\n\t" \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm4,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm2,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm5,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm5,%%xmm3\n\t" \
+
+#endif
--- a/media/libtheora/lib/x86/x86cpu.c
+++ b/media/libtheora/lib/x86/x86cpu.c
@ -0,0 +1,182 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
+ function:
+  last mod: $Id: x86cpu.c 17410 2010-09-21 21:53:48Z tterribe $
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+/*Fallback used when OC_X86_ASM is not defined: no detection is attempted and
+   no CPU features are reported.
+  Return: Always 0.*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+#else
+/*cpuid(_op,_eax,_ebx,_ecx,_edx): Executes the cpuid instruction with _op
+   loaded into eax, and stores the resulting eax, ebx, ecx, and edx register
+   values in the four lvalues given.*/
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+   compiling with -fPIC.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "cpuid\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# else
+/*On x86-32, not so much.
+  Instead of naming ebx as an output, we swap it with a compiler-chosen
+   register before and after the cpuid instruction, so ebx holds its original
+   value again when the asm statement ends and the cpuid result ends up in the
+   chosen register.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   "cpuid\n\t" \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# endif
+
+/*Translates the feature words returned by cpuid function 1 into OC_CPU_X86_*
+   flags.
+  _edx: The value cpuid function 1 returned in edx.
+  _ecx: The value cpuid function 1 returned in ecx.
+  Return: The bitmask of features detected, or 0 if MMX is not supported.*/
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*If there isn't even MMX (edx bit 23), give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  /*edx bit 25: SSE, which also provides the MMX extensions.*/
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  /*edx bit 26: SSE2.*/
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  /*ecx bit 0: SSE3 (PNI).*/
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  /*ecx bit 9: SSSE3.
+    Bit 8 (0x00000100) is Thermal Monitor 2, not SSSE3.*/
+  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3;
+  /*ecx bit 19: SSE4.1.*/
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  /*ecx bit 20: SSE4.2.*/
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+/*Translates the feature words returned by cpuid function 0x80000001 into
+   OC_CPU_X86_* flags, using the AMD bit assignments.
+  _edx: The value the cpuid function returned in edx.
+  _ecx: The value the cpuid function returned in ecx.
+  Return: The bitmask of features detected, or 0 if MMX is not supported.*/
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t caps;
+  /*Nothing else is reported unless MMX itself is present.*/
+  if((_edx&0x00800000)==0)return 0;
+  caps=OC_CPU_X86_MMX;
+  caps|=(_edx&0x00400000)?OC_CPU_X86_MMXEXT:0;
+  caps|=(_edx&0x80000000)?OC_CPU_X86_3DNOW:0;
+  caps|=(_edx&0x40000000)?OC_CPU_X86_3DNOWEXT:0;
+  caps|=(_ecx&0x00000040)?OC_CPU_X86_SSE4A:0;
+  caps|=(_ecx&0x00000800)?OC_CPU_X86_SSE5:0;
+  return caps;
+}
+
+/*Detects the SIMD instruction set extensions supported by the host CPU.
+  The vendor string returned by cpuid function 0 selects which cpuid functions
+   are queried and how their feature words are interpreted.
+  Return: A bitmask of OC_CPU_X86_* flags, or 0 if cpuid is not available or
+           the vendor string is not recognized.*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips support cpuid, so we have to check.
+    We try to toggle bit 21 (the ID flag) of EFLAGS: if the bit reads back
+     unchanged, cpuid is not available.
+    The original EFLAGS value is restored by the final popfl.*/
+  __asm__ __volatile__(
+   "pushfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "movl %[a],%[b]\n\t"
+   "xorl $0x200000,%[a]\n\t"
+   "pushl %[a]\n\t"
+   "popfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "popfl\n\t"
+   :[a]"=r"(eax),[b]"=r"(ebx)
+   :
+   :"cc"
+  );
+  /*No cpuid.*/
+  if(eax==ebx)return 0;
+# endif
+  cpuid(0,eax,ebx,ecx,edx);
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    model=(eax>>4)&0xF;
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so don't use it.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
+  }
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+    /*AMD, Geode:*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)flags=0;
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      flags=oc_parse_amd_flags(edx,ecx);
+    }
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
+  }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    There is a special detection method that can be used to identify such
+     processors, but in my opinion, if the user really wants to change it, they
+     deserve what they get.*/
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA:*/
+    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+       chips (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    /*Function 1 leaves the processor signature in eax, not the highest
+       supported extended function, so that has to be queried explicitly
+       before we can test whether function 0x80000001 exists.*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us that they will all be zero (at least on the C7 and
+         Isaiah chips).
+        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
+    }
+  }
+  else{
+    /*Implement me.*/
+    flags=0;
+  }
+  return flags;
+}
+#endif
--- a/media/libtheora/lib/x86/x86cpu.h
+++ b/media/libtheora/lib/x86/x86cpu.h
@ -10,13 +10,13 @@
 *                                                                  *
 ********************************************************************
 function:
-    last mod: $Id: cpu.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: x86cpu.h 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

-#if !defined(_x86_cpu_H)
-# define _x86_cpu_H (1)
-#include "internal.h"
+#if !defined(_x86_x86cpu_H)
+# define _x86_x86cpu_H (1)
+#include "../internal.h"

 #define OC_CPU_X86_MMX      (1<<0)
 #define OC_CPU_X86_3DNOW    (1<<1)
@ -31,4 +31,6 @@
 #define OC_CPU_X86_SSE4A    (1<<10)
 #define OC_CPU_X86_SSE5     (1<<11)

+ogg_uint32_t oc_cpu_flags_get(void);
+
 #endif
--- a/media/libtheora/lib/x86/x86int.h
+++ b/media/libtheora/lib/x86/x86int.h
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: x86int.h 17578 2010-10-29 04:21:26Z tterribe $

 ********************************************************************/

@ -19,24 +19,104 @@
 # define _x86_x86int_H (1)
 # include "../internal.h"

-void oc_state_vtable_init_x86(oc_theora_state *_state);
+# if defined(OC_X86_ASM)
+#  define oc_state_accel_init oc_state_accel_init_x86
+#  if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+  If the best routine we have available only needs SSE2 (which at the moment
+   covers all of them), then we can avoid runtime detection and the indirect
+   call.*/
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_mmx(_dst,_src,_ystride)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
+   _fragis,_nfragis,_frag_buf_offs)
+#   define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  oc_idct8x8_sse2(_y,_x,_last_zzi)
+#   define oc_state_frag_recon oc_state_frag_recon_mmx
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_mmxext(_bv,_flimit)
+#   define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
+#   define oc_restore_fpu(_state) \
+  oc_restore_fpu_mmx()
+#  else
+#   define OC_STATE_USE_VTABLE (1)
+#  endif
+# endif
+
+# include "../state.h"
+# include "x86cpu.h"
+
+/*Converts the expression in the argument to a string.*/
+#define OC_M2STR(_s) #_s
+
+/*Memory operands do not always include an offset.
+  To avoid warnings, we force an offset with %H (which adds 8).*/
+# if __GNUC_PREREQ(4,0)
+#  define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs-8+%H[_name])
+# endif
+/*If your gcc version doesn't support %H, then you get to suffer the warnings.
+  Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
+   whole offset, instead of substituting in 0 for the missing operand to +.*/
+# if !defined(OC_MEM_OFFS)
+#  define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs+%[_name])
+# endif
+
+/*Declare an array operand with an exact size.
+  This tells gcc we're going to clobber this memory region, without having to
+   clobber all of "memory" and lets us access local buffers directly using the
+   stack pointer, without allocating a separate register to point to them.*/
+#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
+    array_addr__; \
+  }))
+
+/*Declare a read-only (const) array operand with an exact size.
+  This tells gcc exactly which memory region we're going to read, without
+   having to clobber all of "memory" and lets us access local buffers directly
+   using the stack pointer, without allocating a separate register to point to
+   them.*/
+#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    const struct{_type array_value__[(_size)];} *array_addr__= \
+     (const void *)(_ptr); \
+    array_addr__; \
+  }))
+
+extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
+
+void oc_state_accel_init_x86(oc_theora_state *_state);

 void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue);
 void oc_frag_recon_inter_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);

 #endif
--- a/media/libtheora/lib/x86/x86state.c
+++ b/media/libtheora/lib/x86/x86state.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: x86state.c 17421 2010-09-22 16:46:18Z giles $

 ********************************************************************/

@ -19,8 +19,6 @@

 #if defined(OC_X86_ASM)

-#include "../cpu.c"
-
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
   each quadrant of the destination.*/
 static const unsigned char OC_FZIG_ZAG_MMX[128]={
@ -39,24 +37,59 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
 };

-void oc_state_vtable_init_x86(oc_theora_state *_state){
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_SSE2[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+
+void oc_state_accel_init_x86(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
  if(_state->cpu_flags&OC_CPU_X86_MMX){
    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
-    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
    _state->opt_vtable.state_loop_filter_frag_rows=
     oc_state_loop_filter_frag_rows_mmx;
    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
  }
-  else oc_state_vtable_init_c(_state);
+  if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmxext;
+  }
+  if(_state->cpu_flags&OC_CPU_X86_SSE2){
+    _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
+# endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
+# if defined(OC_STATE_USE_VTABLE)
+  }
+# endif
 }
 #endif
--- a/media/libtheora/lib/x86_vc/mmxfrag.c
+++ b/media/libtheora/lib/x86_vc/mmxfrag.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
+    last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $

 ********************************************************************/

@ -22,10 +22,61 @@
  The iteration each instruction belongs to is marked in the comments as #i.*/
 #include <stddef.h>
 #include "x86int.h"
-#include "mmxfrag.h"

 #if defined(OC_X86_ASM)

+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.
+  Before expanding this macro, SRC, DST, YSTRIDE, and YSTRIDE3 must each be
+   #defined to the name of a distinct general-purpose register; all four are
+   overwritten, as are mm0...mm3.
+  The rows are copied in two groups of four, with each row moved as a single
+   8-byte movq load and store.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm  mov SRC,src \
+    __asm  mov DST,dst \
+    __asm  mov YSTRIDE,_ystride \
+    /*src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*ystride3=ystride*3*/ \
+    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*Pointer to next 4.*/ \
+    __asm  lea SRC,[SRC+YSTRIDE*4] \
+    /*dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+    /*Pointer to next 4.*/ \
+    __asm  lea DST,[DST+YSTRIDE*4] \
+    /*src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+  } \
+  while(0)
+
 /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
 void oc_frag_copy_mmx(unsigned char *_dst,
@ -41,6 +92,34 @@ void oc_frag_copy_mmx(unsigned char *_dst,
 #undef YSTRIDE3
 }

+/*Copies a list of 8x8 fragments from one frame to another.
+  Each fragment is located at the same byte offset in both frames.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+/*Registers used by OC_FRAG_COPY_MMX.*/
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+  ptrdiff_t i;
+  i=0;
+  while(i<_nfragis){
+    ptrdiff_t offs;
+    /*Look up where this fragment lives within a frame.*/
+    offs=_frag_buf_offs[_fragis[i]];
+    OC_FRAG_COPY_MMX(_dst_frame+offs,_src_frame+offs,_ystride);
+    i++;
+  }
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+}
+
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
--- a/media/libtheora/lib/x86_vc/mmxfrag.h
+++ b/media/libtheora/lib/x86_vc/mmxfrag.h
@ -1,61 +0,0 @@
-#if !defined(_x86_vc_mmxfrag_H)
-# define _x86_vc_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
-   between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
-  do{ \
-    const unsigned char *src; \
-    unsigned char       *dst; \
-    src=(_src); \
-    dst=(_dst); \
-    __asm  mov SRC,src \
-    __asm  mov DST,dst \
-    __asm  mov YSTRIDE,_ystride \
-    /*src+0*ystride*/ \
-    __asm  movq mm0,[SRC] \
-    /*src+1*ystride*/ \
-    __asm  movq mm1,[SRC+YSTRIDE] \
-    /*ystride3=ystride*3*/ \
-    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
-    /*src+2*ystride*/ \
-    __asm  movq mm2,[SRC+YSTRIDE*2] \
-    /*src+3*ystride*/ \
-    __asm  movq mm3,[SRC+YSTRIDE3] \
-    /*dst+0*ystride*/ \
-    __asm  movq [DST],mm0 \
-    /*dst+1*ystride*/ \
-    __asm  movq [DST+YSTRIDE],mm1 \
-    /*Pointer to next 4.*/ \
-    __asm  lea SRC,[SRC+YSTRIDE*4] \
-    /*dst+2*ystride*/ \
-    __asm  movq [DST+YSTRIDE*2],mm2 \
-    /*dst+3*ystride*/ \
-    __asm  movq [DST+YSTRIDE3],mm3 \
-    /*Pointer to next 4.*/ \
-    __asm  lea DST,[DST+YSTRIDE*4] \
-    /*src+0*ystride*/ \
-    __asm  movq mm0,[SRC] \
-    /*src+1*ystride*/ \
-    __asm  movq mm1,[SRC+YSTRIDE] \
-    /*src+2*ystride*/ \
-    __asm  movq mm2,[SRC+YSTRIDE*2] \
-    /*src+3*ystride*/ \
-    __asm  movq mm3,[SRC+YSTRIDE3] \
-    /*dst+0*ystride*/ \
-    __asm  movq [DST],mm0 \
-    /*dst+1*ystride*/ \
-    __asm  movq [DST+YSTRIDE],mm1 \
-    /*dst+2*ystride*/ \
-    __asm  movq [DST+YSTRIDE*2],mm2 \
-    /*dst+3*ystride*/ \
-    __asm  movq [DST+YSTRIDE3],mm3 \
-  } \
-  while(0)
-
-# endif
-#endif
--- a/media/libtheora/lib/x86_vc/mmxidct.c
+++ b/media/libtheora/lib/x86_vc/mmxidct.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $

 ********************************************************************/

@ -24,15 +24,15 @@

 /*These are offsets into the table of constants below.*/
 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
-#define OC_COSINE_OFFSET (0)
+#define OC_COSINE_OFFSET (8)
 /*A row of 8's.*/
-#define OC_EIGHT_OFFSET  (56)
+#define OC_EIGHT_OFFSET  (0)



 /*A table of constants used by the MMX routines.*/
-static const __declspec(align(16))ogg_uint16_t
- OC_IDCT_CONSTS[(7+1)*4]={
+static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
+      8,    8,    8,    8,
  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
@ -46,28 +46,27 @@ static const __declspec(align(16))ogg_uint16_t
  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
 };

 /*38 cycles*/
-#define OC_IDCT_BEGIN __asm{ \
-  __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN(_y,_x) __asm{ \
+  __asm movq mm2,OC_I(3,_x) \
  __asm movq mm6,OC_C(3) \
  __asm movq mm4,mm2 \
-  __asm movq mm7,OC_J(5) \
+  __asm movq mm7,OC_J(5,_x) \
  __asm pmulhw mm4,mm6 \
  __asm movq mm1,OC_C(5) \
  __asm pmulhw mm6,mm7 \
  __asm movq mm5,mm1 \
  __asm pmulhw mm1,mm2 \
-  __asm movq mm3,OC_I(1) \
+  __asm movq mm3,OC_I(1,_x) \
  __asm pmulhw mm5,mm7 \
  __asm movq mm0,OC_C(1) \
  __asm paddw mm4,mm2 \
  __asm paddw mm6,mm7 \
  __asm paddw mm2,mm1 \
-  __asm movq mm1,OC_J(7) \
+  __asm movq mm1,OC_J(7,_x) \
  __asm paddw mm7,mm5 \
  __asm movq mm5,mm0 \
  __asm pmulhw mm0,mm3 \
@ -77,13 +76,13 @@ static const __declspec(align(16))ogg_uint16_t
  __asm psubw mm6,mm2 \
  __asm paddw mm0,mm3 \
  __asm pmulhw mm3,mm7 \
-  __asm movq mm2,OC_I(2) \
+  __asm movq mm2,OC_I(2,_x) \
  __asm pmulhw mm7,mm1 \
  __asm paddw mm5,mm1 \
  __asm movq mm1,mm2 \
  __asm pmulhw mm2,OC_C(2) \
  __asm psubw mm3,mm5 \
-  __asm movq mm5,OC_J(6) \
+  __asm movq mm5,OC_J(6,_x) \
  __asm paddw mm0,mm7 \
  __asm movq mm7,mm5 \
  __asm psubw mm0,mm4 \
@ -97,18 +96,18 @@ static const __declspec(align(16))ogg_uint16_t
  __asm paddw mm6,mm6 \
  __asm pmulhw mm7,OC_C(6) \
  __asm paddw mm6,mm3 \
-  __asm movq OC_I(1),mm4 \
+  __asm movq OC_I(1,_y),mm4 \
  __asm psubw mm1,mm5 \
  __asm movq mm4,OC_C(4) \
  __asm movq mm5,mm3 \
  __asm pmulhw mm3,mm4 \
  __asm paddw mm7,mm2 \
-  __asm movq OC_I(2),mm6 \
+  __asm movq OC_I(2,_y),mm6 \
  __asm movq mm2,mm0 \
-  __asm movq mm6,OC_I(0) \
+  __asm movq mm6,OC_I(0,_x) \
  __asm pmulhw mm0,mm4 \
  __asm paddw mm5,mm3 \
-  __asm movq mm3,OC_J(4) \
+  __asm movq mm3,OC_J(4,_x) \
  __asm psubw mm5,mm1 \
  __asm paddw mm2,mm0 \
  __asm psubw mm6,mm3 \
@ -122,17 +121,17 @@ static const __declspec(align(16))ogg_uint16_t
  __asm paddw mm6,mm0 \
  __asm psubw mm6,mm2 \
  __asm paddw mm2,mm2 \
-  __asm movq mm0,OC_I(1) \
+  __asm movq mm0,OC_I(1,_y) \
  __asm paddw mm2,mm6 \
  __asm paddw mm4,mm3 \
  __asm psubw mm2,mm1 \
 }

 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm{ \
-  OC_IDCT_BEGIN \
+#define OC_ROW_IDCT(_y,_x) __asm{ \
+  OC_IDCT_BEGIN(_y,_x) \
  /*r3=D'*/ \
-  __asm  movq mm3,OC_I(2) \
+  __asm  movq mm3,OC_I(2,_y) \
  /*r4=E'=E-G*/ \
  __asm  psubw mm4,mm7 \
  /*r1=H'+H'*/ \
@ -157,7 +156,7 @@ static const __declspec(align(16))ogg_uint16_t
  __asm  psubw mm7,mm0 \
  __asm  paddw mm0,mm0 \
  /*Save R1.*/ \
-  __asm  movq OC_I(1),mm1 \
+  __asm  movq OC_I(1,_y),mm1 \
  /*r0=R0=G.+C.*/ \
  __asm  paddw mm0,mm7 \
 }
@ -190,10 +189,10 @@ static const __declspec(align(16))ogg_uint16_t

  Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE __asm{ \
+#define OC_TRANSPOSE(_y) __asm{ \
  __asm movq mm1,mm4 \
  __asm punpcklwd mm4,mm5 \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
  __asm punpckhwd mm1,mm5 \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm7 \
@ -201,17 +200,17 @@ static const __declspec(align(16))ogg_uint16_t
  __asm punpckldq mm4,mm6 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm1 \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
  __asm punpckhwd mm0,mm7 \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
  __asm punpckhdq mm6,mm0 \
-  __asm movq mm4,OC_I(0) \
+  __asm movq mm4,OC_I(0,_y) \
  __asm punpckldq mm1,mm0 \
-  __asm movq mm5,OC_I(1) \
+  __asm movq mm5,OC_I(1,_y) \
  __asm movq mm0,mm4 \
-  __asm movq OC_J(7),mm6 \
+  __asm movq OC_J(7,_y),mm6 \
  __asm punpcklwd mm0,mm5 \
-  __asm movq OC_J(6),mm1 \
+  __asm movq OC_J(6,_y),mm1 \
  __asm punpckhwd mm4,mm5 \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm2,mm3 \
@ -219,18 +218,18 @@ static const __declspec(align(16))ogg_uint16_t
  __asm punpckldq mm0,mm2 \
  __asm punpckhdq mm1,mm2 \
  __asm movq mm2,mm4 \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
  __asm punpckhwd mm5,mm3 \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
  __asm punpckhdq mm4,mm5 \
  __asm punpckldq mm2,mm5 \
-  __asm movq OC_I(3),mm4 \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(3,_y),mm4 \
+  __asm movq OC_I(2,_y),mm2 \
 }

 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm{ \
-  OC_IDCT_BEGIN \
+#define OC_COLUMN_IDCT(_y) __asm{ \
+  OC_IDCT_BEGIN(_y,_y) \
  __asm paddw mm2,OC_8 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
@ -243,15 +242,15 @@ static const __declspec(align(16))ogg_uint16_t
  /*r1=NR1*/ \
  __asm psraw mm1,4 \
  /*r3=D'*/ \
-  __asm movq mm3,OC_I(2) \
+  __asm movq mm3,OC_I(2,_y) \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*Store NR2 at I(2).*/ \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(2,_y),mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*Store NR1 at I(1).*/ \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm4,OC_8 \
@ -273,11 +272,11 @@ static const __declspec(align(16))ogg_uint16_t
  /*r6=NR6*/ \
  __asm psraw mm6,4 \
  /*Store NR4 at J(4).*/ \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
  /*r5=NR5*/ \
  __asm psraw mm5,4 \
  /*Store NR3 at I(3).*/ \
-  __asm movq OC_I(3),mm3 \
+  __asm movq OC_I(3,_y),mm3 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm7,OC_8 \
@ -288,71 +287,92 @@ static const __declspec(align(16))ogg_uint16_t
  /*r7=NR7*/ \
  __asm psraw mm7,4 \
  /*Store NR6 at J(6).*/ \
-  __asm movq OC_J(6),mm6 \
+  __asm movq OC_J(6,_y),mm6 \
  /*r0=NR0*/ \
  __asm psraw mm0,4 \
  /*Store NR5 at J(5).*/ \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
  /*Store NR7 at J(7).*/ \
-  __asm movq OC_J(7),mm7 \
+  __asm movq OC_J(7,_y),mm7 \
  /*Store NR0 at I(0).*/ \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
 }

 #define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)

-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  /*Transforms _x into _y; _x is zeroed afterwards unless it aliases _y.*/
  /*This routine accepts an 8x8 matrix, but in partially transposed form.
    Every 4x4 block is transposed.*/
  __asm{
 #define CONSTS eax
 #define Y edx
+#define X ecx
    mov CONSTS,offset OC_IDCT_CONSTS
    mov Y,_y
-#define OC_I(_k)      [Y+_k*16]
-#define OC_J(_k)      [Y+(_k-4)*16+8]
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+    mov X,_x
+#define OC_I(_k,_y)   [(_y)+(_k)*16]
+#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+8]
+    OC_ROW_IDCT(Y,X)
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+(_k*16)+64]
-#define OC_J(_k)      [Y+(_k-4)*16+72]
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   [(_y)+(_k)*16+64]
+#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+72]
+    OC_ROW_IDCT(Y,X)
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+_k*16]
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   [(_y)+(_k)*16]
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+_k*16+8]
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   [(_y)+(_k)*16+8]
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(Y)
 #undef  OC_I
 #undef  OC_J
 #undef  CONSTS
 #undef  Y
+#undef  X
+  }
+  if(_x!=_y){
+    int i;
+    __asm pxor mm0,mm0;
+    for(i=0;i<4;i++){
+      ogg_int16_t *x;
+      x=_x+16*i;
+#define X ecx
+      __asm{
+        mov X,x
+        movq [X+0x00],mm0
+        movq [X+0x08],mm0
+        movq [X+0x10],mm0
+        movq [X+0x18],mm0
+      }
+#undef  X
+    }
   }
 }

 /*25 cycles.*/
-#define OC_IDCT_BEGIN_10 __asm{ \
-  __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
+  __asm movq mm2,OC_I(3,_x) \
  __asm nop \
  __asm movq mm6,OC_C(3) \
  __asm movq mm4,mm2 \
  __asm movq mm1,OC_C(5) \
  __asm pmulhw mm4,mm6 \
-  __asm movq mm3,OC_I(1) \
+  __asm movq mm3,OC_I(1,_x) \
  __asm pmulhw mm1,mm2 \
  __asm movq mm0,OC_C(1) \
  __asm paddw mm4,mm2 \
  __asm pxor mm6,mm6 \
  __asm paddw mm2,mm1 \
-  __asm movq mm5,OC_I(2) \
+  __asm movq mm5,OC_I(2,_x) \
  __asm pmulhw mm0,mm3 \
  __asm movq mm1,mm5 \
  __asm paddw mm0,mm3 \
@ -360,43 +380,43 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  __asm psubw mm6,mm2 \
  __asm pmulhw mm5,OC_C(2) \
  __asm psubw mm0,mm4 \
-  __asm movq mm7,OC_I(2) \
+  __asm movq mm7,OC_I(2,_x) \
  __asm paddw mm4,mm4 \
  __asm paddw mm7,mm5 \
  __asm paddw mm4,mm0 \
  __asm pmulhw mm1,OC_C(6) \
  __asm psubw mm3,mm6 \
-  __asm movq OC_I(1),mm4 \
+  __asm movq OC_I(1,_y),mm4 \
  __asm paddw mm6,mm6 \
  __asm movq mm4,OC_C(4) \
  __asm paddw mm6,mm3 \
  __asm movq mm5,mm3 \
  __asm pmulhw mm3,mm4 \
-  __asm movq OC_I(2),mm6 \
+  __asm movq OC_I(2,_y),mm6 \
  __asm movq mm2,mm0 \
-  __asm movq mm6,OC_I(0) \
+  __asm movq mm6,OC_I(0,_x) \
  __asm pmulhw mm0,mm4 \
  __asm paddw mm5,mm3 \
  __asm paddw mm2,mm0 \
  __asm psubw mm5,mm1 \
  __asm pmulhw mm6,mm4 \
-  __asm paddw mm6,OC_I(0) \
+  __asm paddw mm6,OC_I(0,_x) \
  __asm paddw mm1,mm1 \
  __asm movq mm4,mm6 \
  __asm paddw mm1,mm5 \
  __asm psubw mm6,mm2 \
  __asm paddw mm2,mm2 \
-  __asm movq mm0,OC_I(1) \
+  __asm movq mm0,OC_I(1,_y) \
  __asm paddw mm2,mm6 \
  __asm psubw mm2,mm1 \
  __asm nop \
 }

 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm{ \
-  OC_IDCT_BEGIN_10 \
+#define OC_ROW_IDCT_10(_y,_x) __asm{ \
+  OC_IDCT_BEGIN_10(_y,_x) \
  /*r3=D'*/ \
-   __asm movq mm3,OC_I(2) \
+   __asm movq mm3,OC_I(2,_y) \
  /*r4=E'=E-G*/ \
   __asm psubw mm4,mm7 \
  /*r1=H'+H'*/ \
@ -421,14 +441,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   __asm psubw mm7,mm0 \
   __asm paddw mm0,mm0 \
  /*Save R1.*/ \
-   __asm movq OC_I(1),mm1 \
+   __asm movq OC_I(1,_y),mm1 \
  /*r0=R0=G'+C'*/ \
   __asm paddw mm0,mm7 \
 }

 /*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 __asm{ \
-  OC_IDCT_BEGIN_10 \
+#define OC_COLUMN_IDCT_10(_y) __asm{ \
+  OC_IDCT_BEGIN_10(_y,_y) \
  __asm paddw mm2,OC_8 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
@ -441,15 +461,15 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r1=NR1*/ \
  __asm psraw mm1,4 \
  /*r3=D'*/ \
-  __asm movq mm3,OC_I(2) \
+  __asm movq mm3,OC_I(2,_y) \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*Store NR2 at I(2).*/ \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(2,_y),mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*Store NR1 at I(1).*/ \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm4,OC_8 \
@ -471,11 +491,11 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r6=NR6*/ \
  __asm psraw mm6,4 \
  /*Store NR4 at J(4).*/ \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
  /*r5=NR5*/ \
  __asm psraw mm5,4 \
  /*Store NR3 at I(3).*/ \
-  __asm movq OC_I(3),mm3 \
+  __asm movq OC_I(3,_y),mm3 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm7,OC_8 \
@ -486,50 +506,65 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r7=NR7*/ \
  __asm psraw mm7,4 \
  /*Store NR6 at J(6).*/ \
-  __asm movq OC_J(6),mm6 \
+  __asm movq OC_J(6,_y),mm6 \
  /*r0=NR0*/ \
  __asm psraw mm0,4 \
  /*Store NR5 at J(5).*/ \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
  /*Store NR7 at J(7).*/ \
-  __asm movq OC_J(7),mm7 \
+  __asm movq OC_J(7,_y),mm7 \
  /*Store NR0 at I(0).*/ \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
 }

-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  __asm{
 #define CONSTS eax
 #define Y edx
+#define X ecx
    mov CONSTS,offset OC_IDCT_CONSTS
    mov Y,_y
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) [Y+(_k-4)*16+8]
+    mov X,_x
+#define OC_I(_k,_y) [(_y)+(_k)*16]
+#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
    /*Done with dequant, descramble, and partial transpose.
      Now do the iDCT itself.*/
-    OC_ROW_IDCT_10
-    OC_TRANSPOSE
+    OC_ROW_IDCT_10(Y,X)
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+(_k)*16]
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) [Y+_k*16+8]
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+(_k)*16+8]
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(Y)
 #undef  OC_I
 #undef  OC_J
 #undef  CONSTS
 #undef  Y
+#undef  X
+  }
+  if(_x!=_y){
+#define X ecx
+    __asm{
+      pxor mm0,mm0;
+      mov X,_x
+      movq [X+0x00],mm0
+      movq [X+0x10],mm0
+      movq [X+0x20],mm0
+      movq [X+0x30],mm0
+    }
+#undef  X
  }
 }

 /*Performs an inverse 8x8 Type-II DCT transform.
  The input is assumed to be scaled by a factor of 4 relative to orthonormal
   version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
  /*_last_zzi is subtly different from an actual count of the number of
     coefficients we decoded for this block.
    It contains the value of zzi BEFORE the final token in the block was
@ -555,8 +590,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
     gets.
    Needless to say we inherited this approach from VP3.*/
  /*Perform the iDCT.*/
-  if(_last_zzi<10)oc_idct8x8_10(_y);
-  else oc_idct8x8_slow(_y);
+  if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+  else oc_idct8x8_slow(_y,_x);
 }

 #endif
--- a/media/libtheora/lib/x86_vc/mmxstate.c
+++ b/media/libtheora/lib/x86_vc/mmxstate.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
+    last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $

 ********************************************************************/

@ -19,17 +19,16 @@
  Originally written by Rudolf Marek.*/
 #include <string.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 #include "mmxloop.h"

 #if defined(OC_X86_ASM)

 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
-  int            mb_mode;
+  int            refi;
  /*Apply the inverse transform.*/
  /*Special case only having a DC component.*/
  if(_last_zzi<2){
@ -45,6 +44,7 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 #define P ecx
      mov Y,_dct_coeffs
      movzx P,p
+      lea Y,[Y+128]
      /*mm0=0000 0000 0000 AAAA*/
      movd mm0,P
      /*mm0=0000 0000 AAAA AAAA*/
@ -74,65 +74,32 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  else{
    /*Dequantize the DC coefficient.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+    oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
  else{
    const unsigned char *ref;
    int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       _dct_coeffs);
+       _dct_coeffs+64);
    }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
  }
 }

 /*We copy these entire function to inline the actual MMX routines so that we
   use only a single indirect call.*/

-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-#define SRC edx
-#define DST eax
-#define YSTRIDE ecx
-#define YSTRIDE3 edi
-    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
-#undef SRC
-#undef DST
-#undef YSTRIDE
-#undef YSTRIDE3
-  }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+  memset(_bv,~(_flimit<<1),8);
 }

 /*Apply the loop filter to a given set of fragment rows in the given plane.
@ -144,8 +111,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
  _fragy0:    The Y coordinate of the first fragment row to filter.
  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
-  OC_ALIGN8(unsigned char  ll[8]);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
  const oc_fragment_plane *fplane;
  const oc_fragment       *frags;
  const ptrdiff_t         *frag_buf_offs;
@ -156,13 +122,12 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
  ptrdiff_t                fragi0_end;
  int                      ystride;
  int                      nhfrags;
-  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
  fplane=_state->fplanes+_pli;
  nhfrags=fplane->nhfrags;
  fragi_top=fplane->froffset;
  fragi_bot=fragi_top+fplane->nfrags;
  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
-  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
  ystride=_state->ref_ystride[_pli];
  frags=_state->frags;
  frag_buf_offs=_state->frag_buf_offs;
@ -187,13 +152,13 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
 #define LL edx
 #define D esi
 #define D_WORD si
-        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
-        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
+        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
-          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+          OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
        }
        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
-          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
        }
 #undef PIX
 #undef YSTRIDE3
--- a/media/libtheora/lib/x86_vc/x86cpu.c
+++ b/media/libtheora/lib/x86_vc/x86cpu.c
@ -14,41 +14,17 @@
  Originally written by Rudolf Marek.

 function:
-  last mod: $Id: cpu.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id: x86cpu.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

-#include "cpu.h"
+#include "x86cpu.h"

 #if !defined(OC_X86_ASM)
-static ogg_uint32_t oc_cpu_flags_get(void){
+ogg_uint32_t oc_cpu_flags_get(void){
  return 0;
 }
 #else
-# if !defined(_MSC_VER)
-#  if defined(__amd64__)||defined(__x86_64__)
-/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
-   compiling with -fPIC.*/
-#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
-  __asm__ __volatile__( \
-   "cpuid\n\t" \
-   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
-   :"a"(_op) \
-   :"cc" \
-  )
-#  else
-/*On x86-32, not so much.*/
-#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
-  __asm__ __volatile__( \
-   "xchgl %%ebx,%[ebx]\n\t" \
-   "cpuid\n\t" \
-   "xchgl %%ebx,%[ebx]\n\t" \
-   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
-   :"a"(_op) \
-   :"cc" \
-  )
-#  endif
-# else
 /*Why does MSVC need this complicated rigamarole?
  At this point I honestly do not care.*/

@ -95,7 +71,6 @@ static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
    mov [ecx],ebx
  }
 }
-# endif

 static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t flags;
@ -124,7 +99,7 @@ static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  return flags;
 }

-static ogg_uint32_t oc_cpu_flags_get(void){
+ogg_uint32_t oc_cpu_flags_get(void){
  ogg_uint32_t flags;
  ogg_uint32_t eax;
  ogg_uint32_t ebx;
@ -132,25 +107,7 @@ static ogg_uint32_t oc_cpu_flags_get(void){
  ogg_uint32_t edx;
 # if !defined(__amd64__)&&!defined(__x86_64__)
  /*Not all x86-32 chips support cpuid, so we have to check.*/
-#  if !defined(_MSC_VER)
-  __asm__ __volatile__(
-   "pushfl\n\t"
-   "pushfl\n\t"
-   "popl %[a]\n\t"
-   "movl %[a],%[b]\n\t"
-   "xorl $0x200000,%[a]\n\t"
-   "pushl %[a]\n\t"
-   "popfl\n\t"
-   "pushfl\n\t"
-   "popl %[a]\n\t"
-   "popfl\n\t"
-   :[a]"=r"(eax),[b]"=r"(ebx)
-   :
-   :"cc"
-  );
-#  else
  oc_detect_cpuid_helper(&eax,&ebx);
-#  endif
  /*No cpuid.*/
  if(eax==ebx)return 0;
 # endif
@ -159,9 +116,18 @@ static ogg_uint32_t oc_cpu_flags_get(void){
  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
   /*      6 8 x M          T e n i          u n e G*/
   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
    /*Intel, Transmeta (tested with Crusoe TM5800):*/
    cpuid(1,eax,ebx,ecx,edx);
    flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    model=((eax>>4)&0xF)|((eax>>12)&0xF0);
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so skip it; model includes CPUID's extended model bits 16-19.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
  }
  /*              D M A c          i t n e          h t u A*/
  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
--- a/media/libtheora/lib/x86_vc/x86cpu.h
+++ b/media/libtheora/lib/x86_vc/x86cpu.h
@ -0,0 +1,36 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: x86cpu.h 17410 2010-09-21 21:53:48Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86cpu_H)
+# define _x86_vc_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
--- a/media/libtheora/lib/x86_vc/x86int.h
+++ b/media/libtheora/lib/x86_vc/x86int.h
@ -11,32 +11,39 @@
 ********************************************************************

  function:
-    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: x86int.h 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

 #if !defined(_x86_vc_x86int_H)
 # define _x86_vc_x86int_H (1)
 # include "../internal.h"
+# if defined(OC_X86_ASM)
+#  define oc_state_accel_init oc_state_accel_init_x86
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+# include "../state.h"
+# include "x86cpu.h"

-void oc_state_vtable_init_x86(oc_theora_state *_state);
+void oc_state_accel_init_x86(oc_theora_state *_state);

 void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue);
 void oc_frag_recon_inter_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);

 #endif
--- a/media/libtheora/lib/x86_vc/x86state.c
+++ b/media/libtheora/lib/x86_vc/x86state.c
@ -11,7 +11,7 @@
 ********************************************************************

  function:
-    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id: x86state.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

@ -19,8 +19,6 @@

 #if defined(OC_X86_ASM)

-#include "../cpu.c"
-
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
   each quadrant of the destination.*/
 static const unsigned char OC_FZIG_ZAG_MMX[128]={
@ -42,21 +40,22 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
  64,64,64,64,64,64,64,64,
 };

-void oc_state_vtable_init_x86(oc_theora_state *_state){
+void oc_state_accel_init_x86(oc_theora_state *_state){
  _state->cpu_flags=oc_cpu_flags_get();
  if(_state->cpu_flags&OC_CPU_X86_MMX){
    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
-    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
    _state->opt_vtable.state_loop_filter_frag_rows=
     oc_state_loop_filter_frag_rows_mmx;
    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
  }
-  else oc_state_vtable_init_c(_state);
+  else oc_state_accel_init_c(_state);
 }
 #endif
--- a/media/libtheora/update.sh
+++ b/media/libtheora/update.sh
@ -2,11 +2,20 @@
 #
 # Copies the needed files from a directory containing the original
 # libtheora source that we need for the Mozilla HTML5 media support.
-sed s/\#define\ OC_X86_ASM//g $1/config.h >./lib/config.h
-sed s/\#define\ USE_ASM//g ./lib/config.h >./lib/config.h2
-sed s/\#define\ THEORA_DISABLE_ENCODE//g ./lib/config.h2 >./lib/config.h
-rm ./lib/config.h2
-cp ./lib/config.h ./include/theora/config.h
+sed \
+ -e s/\#define\ OC_X86_ASM//g \
+ -e s/\#define\ OC_X86_64_ASM//g \
+ -e s/\#define\ OC_ARM_ASM_EDSP\ 1//g \
+ -e s/\#define\ OC_ARM_ASM_MEDIA\ 1//g \
+ -e s/\#define\ OC_ARM_ASM_NEON\ 1//g \
+ -e s/\#define\ OC_ARM_ASM//g \
+ -e s/\#define\ THEORA_DISABLE_ENCODE//g \
+ $1/config.h > lib/config.h
+sed \
+ -e s/@HAVE_ARM_ASM_EDSP@/1/g \
+ -e s/@HAVE_ARM_ASM_MEDIA@/1/g \
+ -e s/@HAVE_ARM_ASM_NEON@/1/g \
+ $1/lib/arm/armopts.s.in > lib/arm/armopts.s
 cp $1/LICENSE ./LICENSE
 cp $1/CHANGES ./CHANGES
 cp $1/COPYING ./COPYING
@ -16,8 +25,6 @@ cp $1/lib/apiwrapper.c ./lib/
 cp $1/lib/apiwrapper.h ./lib/
 cp $1/lib/bitpack.c ./lib/
 cp $1/lib/bitpack.h ./lib/
-cp $1/lib/cpu.c ./lib/
-cp $1/lib/cpu.h ./lib/
 cp $1/lib/dct.h ./lib/
 cp $1/lib/decapiwrapper.c ./lib/
 cp $1/lib/decinfo.c ./lib/
@ -25,13 +32,9 @@ cp $1/lib/decint.h ./lib/
 cp $1/lib/decode.c ./lib/
 cp $1/lib/dequant.c ./lib/
 cp $1/lib/dequant.h ./lib/
-cp $1/lib/encint.h ./lib/
-cp $1/lib/encoder_disabled.c ./lib/
-cp $1/lib/enquant.h ./lib/
 cp $1/lib/fragment.c ./lib/
 cp $1/lib/huffdec.c ./lib/
 cp $1/lib/huffdec.h ./lib/
-cp $1/lib/huffenc.h ./lib/
 cp $1/lib/huffman.h ./lib/
 cp $1/lib/idct.c ./lib/
 cp $1/lib/info.c ./lib/
@ -42,22 +45,36 @@ cp $1/lib/ocintrin.h ./lib/
 cp $1/lib/quant.c ./lib/
 cp $1/lib/quant.h ./lib/
 cp $1/lib/state.c ./lib/
+cp $1/lib/state.h ./lib/
+cp $1/lib/arm/arm2gnu.pl ./lib/arm/
+cp $1/lib/arm/armbits.h ./lib/arm/
+cp $1/lib/arm/armbits.s ./lib/arm/
+cp $1/lib/arm/armcpu.c ./lib/arm/
+cp $1/lib/arm/armcpu.h ./lib/arm/
+cp $1/lib/arm/armfrag.s ./lib/arm/
+cp $1/lib/arm/armidct.s ./lib/arm/
+cp $1/lib/arm/armint.h ./lib/arm/
+cp $1/lib/arm/armloop.s ./lib/arm/
+cp $1/lib/arm/armstate.c ./lib/arm/
 cp $1/lib/x86/mmxfrag.c ./lib/x86/
-cp $1/lib/x86/mmxfrag.h ./lib/x86/
 cp $1/lib/x86/mmxidct.c ./lib/x86/
 cp $1/lib/x86/mmxloop.h ./lib/x86/
 cp $1/lib/x86/mmxstate.c ./lib/x86/
+cp $1/lib/x86/sse2idct.c ./lib/x86/
+cp $1/lib/x86/sse2trans.h ./lib/x86/
+cp $1/lib/x86/x86cpu.c ./lib/x86/
+cp $1/lib/x86/x86cpu.h ./lib/x86/
 cp $1/lib/x86/x86int.h ./lib/x86/
 cp $1/lib/x86/x86state.c ./lib/x86/
 cp $1/lib/x86_vc/mmxfrag.c ./lib/x86_vc/
-cp $1/lib/x86_vc/mmxfrag.h ./lib/x86_vc/
 cp $1/lib/x86_vc/mmxidct.c ./lib/x86_vc/
 cp $1/lib/x86_vc/mmxloop.h ./lib/x86_vc/
 cp $1/lib/x86_vc/mmxstate.c ./lib/x86_vc/
+cp $1/lib/x86_vc/x86cpu.c ./lib/x86_vc/
+cp $1/lib/x86_vc/x86cpu.h ./lib/x86_vc/
 cp $1/lib/x86_vc/x86int.h ./lib/x86_vc/
 cp $1/lib/x86_vc/x86state.c ./lib/x86_vc/
 cp $1/include/theora/theora.h ./include/theora/theora.h
 cp $1/include/theora/theoradec.h ./include/theora/theoradec.h
 cp $1/include/theora/theoraenc.h ./include/theora/theoraenc.h
 cp $1/include/theora/codec.h ./include/theora/codec.h
-patch -p3 <bug559343.patch