mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-24 13:21:05 +00:00
Bug 608166 - Add ARM assembly optimizations for libtheora r=chris.double,tterribe,khuey a=b-f
This commit is contained in:
parent
dc49da79ab
commit
bed266bd26
@ -16,6 +16,9 @@ Nils Pipenbrinck
|
||||
Monty
|
||||
- MMX optimized functions
|
||||
|
||||
David Schleef
|
||||
- C64x port
|
||||
|
||||
Aaron Colwell
|
||||
Thomas Vander Stichele
|
||||
Jan Gerber
|
||||
@ -45,5 +48,7 @@ Arc Riley
|
||||
Rodolphe Ortalo
|
||||
- Bug fixes
|
||||
|
||||
Robin Watts
|
||||
- ARM code optimisations
|
||||
|
||||
and other Xiph.org contributors
|
||||
|
@ -1,6 +1,26 @@
|
||||
libtheora 1.2.0alpha1 (2010 September 23)
|
||||
|
||||
- New 'ptalarbvorm' encoder with better rate/distortion optimization
|
||||
- New th_encode_ctl option for copying configuration from an existing
|
||||
setup header, useful for splicing streams.
|
||||
- Returns TH_DUPFRAME in more cases.
|
||||
- Add ARM optimizations
|
||||
- Add TI C64x+ DSP optimizations
|
||||
- Other performance improvements
|
||||
- Rename speedlevel 2 to 3 and provide a new speedlevel 2
|
||||
- Various minor bug fixes
|
||||
|
||||
libtheora 1.1.2 (unreleased snapshot)
|
||||
|
||||
- no changes recorded
|
||||
- Fix Huffman table decoding when OC_HUFF_SLUSH is set to 0
|
||||
- Fix a frame size bug in player_example
|
||||
- Add support for passing a buffer the size of the picture
|
||||
region, rather than a full padded frame to th_encode_ycbcr_in()
|
||||
as was possible with the legacy pre-1.0 API.
|
||||
- 4:4:4 support in player_example using software yuv->rgb
|
||||
- Better rgb->yuv conversion in png2theora
|
||||
- Clean up warnings and local variables
|
||||
- Build and documentation fixes
|
||||
|
||||
libtheora 1.1.1 (2009 October 1)
|
||||
|
||||
@ -128,7 +148,7 @@ libtheora 1.0beta1 (2007 September 22)
|
||||
- Granulepos scheme modified to match other codecs. This bumps
|
||||
the bitstream revision to 3.2.1. Bitstreams marked 3.2.0 are
|
||||
handled correctly by this decoder. Older decoders will show
|
||||
a one frame sync error in the less noticable direction.
|
||||
a one frame sync error in the less noticeable direction.
|
||||
|
||||
libtheora 1.0alpha8 (2007 September 18)
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
-------------------------------------------------------------------------
|
||||
The Xiph.org Foundation's libtheora 1.1
|
||||
The Xiph.org Foundation's libtheora 1.2
|
||||
-------------------------------------------------------------------------
|
||||
|
||||
*** What is Theora?
|
||||
@ -12,10 +12,13 @@ while allow it a longer useful lifetime as an competitive codec.
|
||||
The 1.0 release decoder supported all the new features, but the
|
||||
encoder is nearly identical to the VP3 code.
|
||||
|
||||
The 1.1 release features a completely rewritten encoder, offering
|
||||
The 1.1 release featured a completely rewritten encoder, offering
|
||||
better performance and compression, and making more complete use
|
||||
of the format's feature set. Files produced by both encoders can
|
||||
be decoded by either release.
|
||||
of the format's feature set.
|
||||
|
||||
The 1.2 release features significant additional improvements in
|
||||
compression and performance. Files produced by newer encoders can
|
||||
be decoded by earlier releases.
|
||||
|
||||
*** Where is Theora?
|
||||
|
||||
@ -41,6 +44,7 @@ Requirements summary:
|
||||
as above,
|
||||
|
||||
libvorbis and libvorbisenc 1.0.1 or newer.
|
||||
(libvorbis 1.3.1 or newer for 5.1 audio)
|
||||
|
||||
For creating a source distribution package:
|
||||
|
||||
@ -66,7 +70,7 @@ Windows build support is included in the win32 directory.
|
||||
|
||||
Project files for Apple XCode are included in the macosx directory.
|
||||
|
||||
There is also an experimental scons build.
|
||||
There is also a more limited scons build.
|
||||
|
||||
*** How do I use the sample encoder?
|
||||
|
||||
|
@ -2,6 +2,4 @@ The source from this directory was copied from the theora subversion trunk
|
||||
using the update.sh script. The changes made were those applied by update.sh,
|
||||
and the addition/update of Makefile.in files for the Mozilla build system.
|
||||
|
||||
The subversion revision used was r16712.
|
||||
|
||||
bug559343.patch: Silence Coverity warning.
|
||||
The subversion revision used was r17578.
|
||||
|
@ -1,22 +0,0 @@
|
||||
diff --git a/media/libtheora/lib/state.c b/media/libtheora/lib/state.c
|
||||
--- a/media/libtheora/lib/state.c
|
||||
+++ b/media/libtheora/lib/state.c
|
||||
@@ -87,17 +87,17 @@ static void oc_sb_create_plane_mapping(o
|
||||
int quadi;
|
||||
int i;
|
||||
/*Figure out how many rows of blocks in this super block lie within the
|
||||
image.*/
|
||||
jmax=_hfrags-x;
|
||||
if(jmax>4)jmax=4;
|
||||
else if(jmax<=0)break;
|
||||
/*By default, set all fragment indices to -1.*/
|
||||
- memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi]));
|
||||
+ memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi]));
|
||||
/*Fill in the fragment map for this super block.*/
|
||||
xfrag=yfrag+x;
|
||||
for(i=0;i<imax;i++){
|
||||
int j;
|
||||
for(j=0;j<jmax;j++){
|
||||
_sb_maps[sbi][SB_MAP[i][j][0]][SB_MAP[i][j][1]]=xfrag+j;
|
||||
}
|
||||
xfrag+=_hfrags;
|
@ -1,90 +0,0 @@
|
||||
/* config.h. Generated from config.h.in by configure. */
|
||||
/* config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
/* libcairo is available for visual debugging output */
|
||||
/* #undef HAVE_CAIRO */
|
||||
|
||||
/* Define to 1 if you have the <dlfcn.h> header file. */
|
||||
#define HAVE_DLFCN_H 1
|
||||
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#define HAVE_INTTYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the <machine/soundcard.h> header file. */
|
||||
/* #undef HAVE_MACHINE_SOUNDCARD_H */
|
||||
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#define HAVE_MEMORY_H 1
|
||||
|
||||
/* Define to 1 if you have the <soundcard.h> header file. */
|
||||
/* #undef HAVE_SOUNDCARD_H */
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#define HAVE_STDINT_H 1
|
||||
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
#define HAVE_STDLIB_H 1
|
||||
|
||||
/* Define to 1 if you have the <strings.h> header file. */
|
||||
#define HAVE_STRINGS_H 1
|
||||
|
||||
/* Define to 1 if you have the <string.h> header file. */
|
||||
#define HAVE_STRING_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/soundcard.h> header file. */
|
||||
/* #undef HAVE_SYS_SOUNDCARD_H */
|
||||
|
||||
/* Define to 1 if you have the <sys/stat.h> header file. */
|
||||
#define HAVE_SYS_STAT_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/types.h> header file. */
|
||||
#define HAVE_SYS_TYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#define HAVE_UNISTD_H 1
|
||||
|
||||
/* Define to the sub-directory in which libtool stores uninstalled libraries.
|
||||
*/
|
||||
#define LT_OBJDIR ".libs/"
|
||||
|
||||
/* Define to 1 if your C compiler doesn't accept -c and -o together. */
|
||||
/* #undef NO_MINUS_C_MINUS_O */
|
||||
|
||||
/* make use of x86_64 asm optimization */
|
||||
/* #undef OC_X86_64_ASM */
|
||||
|
||||
/* make use of x86 asm optimization */
|
||||
/**/
|
||||
|
||||
/* Name of package */
|
||||
#define PACKAGE "libtheora"
|
||||
|
||||
/* Define to the address where bug reports for this package should be sent. */
|
||||
#define PACKAGE_BUGREPORT ""
|
||||
|
||||
/* Define to the full name of this package. */
|
||||
#define PACKAGE_NAME "libtheora"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "libtheora 1.1.1+svn"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "libtheora"
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "1.1.1+svn"
|
||||
|
||||
/* Define to 1 if you have the ANSI C header files. */
|
||||
#define STDC_HEADERS 1
|
||||
|
||||
/* Define to exclude encode support from the build */
|
||||
/* #undef THEORA_DISABLE_ENCODE */
|
||||
|
||||
/* Define to exclude floating point code from the build */
|
||||
/* #undef THEORA_DISABLE_FLOAT */
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "1.1.1+svn"
|
@ -179,7 +179,7 @@ typedef enum {
|
||||
OC_PF_420, /**< Chroma subsampling by 2 in each direction (4:2:0) */
|
||||
OC_PF_RSVD, /**< Reserved value */
|
||||
OC_PF_422, /**< Horizontal chroma subsampling by 2 (4:2:2) */
|
||||
OC_PF_444, /**< No chroma subsampling at all (4:4:4) */
|
||||
OC_PF_444 /**< No chroma subsampling at all (4:4:4) */
|
||||
} theora_pixelformat;
|
||||
|
||||
/**
|
||||
|
@ -283,7 +283,8 @@ extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
|
||||
* \retval 0 Success.
|
||||
* A new decoded frame can be retrieved by calling
|
||||
* th_decode_ycbcr_out().
|
||||
* \retval TH_DUPFRAME The packet represented a dropped (0-byte) frame.
|
||||
* \retval TH_DUPFRAME The packet represented a dropped frame (either a
|
||||
* 0-byte frame or an INTER frame with no coded blocks).
|
||||
* The player can skip the call to th_decode_ycbcr_out(),
|
||||
* as the contents of the decoded frame buffer have not
|
||||
* changed.
|
||||
|
@ -43,7 +43,7 @@ extern "C" {
|
||||
* <tt>NULL</tt> may be specified to revert to the default tables.
|
||||
*
|
||||
* \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
|
||||
* \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL Encoding has already begun or one or more of the given
|
||||
* tables is not full or prefix-free, \a _buf is
|
||||
* <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
|
||||
@ -57,7 +57,7 @@ extern "C" {
|
||||
* <tt>NULL</tt> may be specified to revert to the default parameters.
|
||||
*
|
||||
* \param[in] _buf #th_quant_info
|
||||
* \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL Encoding has already begun, \a _buf is
|
||||
* <tt>NULL</tt> and \a _buf_sz is not zero,
|
||||
* or \a _buf is non-<tt>NULL</tt> and
|
||||
@ -73,7 +73,7 @@ extern "C" {
|
||||
* \param[in] _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
|
||||
* frames.
|
||||
* \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
|
||||
@ -101,7 +101,7 @@ extern "C" {
|
||||
* 4:2:0, the picture region is smaller than the full frame,
|
||||
* or if encoding has begun, preventing the quantization
|
||||
* tables and codebooks from being set.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
|
||||
@ -114,7 +114,7 @@ extern "C" {
|
||||
* the current encoding mode (VBR vs. constant quality, etc.).
|
||||
*
|
||||
* \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
@ -124,7 +124,7 @@ extern "C" {
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: The new encoding speed level.
|
||||
* 0 is slowest, larger values use less CPU.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||
* encoding speed level is out of bounds.
|
||||
* The maximum encoding speed level may be
|
||||
@ -142,7 +142,7 @@ extern "C" {
|
||||
*
|
||||
* \param[out] _buf <tt>int</tt>: The current encoding speed level.
|
||||
* 0 is slowest, larger values use less CPU.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
@ -162,7 +162,7 @@ extern "C" {
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
|
||||
* If this is negative or zero, no duplicates will be produced.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||
* number of duplicates is greater than or equal to the
|
||||
* maximum keyframe interval.
|
||||
@ -187,7 +187,7 @@ extern "C" {
|
||||
* use.
|
||||
* - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
|
||||
* later.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
|
||||
* is not enabled.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
@ -211,7 +211,7 @@ extern "C" {
|
||||
* \param[in] _buf <tt>int</tt>: Requested size of the reservoir measured in
|
||||
* frames.
|
||||
* \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
|
||||
* is not enabled. The buffer has an implementation
|
||||
* defined minimum and maximum size and the value in _buf
|
||||
@ -243,7 +243,7 @@ extern "C" {
|
||||
* application.
|
||||
* \retval >=0 The number of bytes of metric data available in the
|
||||
* returned buffer.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
|
||||
* bitrate has been set, or the first call was made after
|
||||
* the first frame was submitted for encoding.
|
||||
@ -283,7 +283,7 @@ extern "C" {
|
||||
* of bytes consumed.
|
||||
* \retval >0 The number of bytes of metric data required/consumed.
|
||||
* \retval 0 No more data is required before the next frame.
|
||||
* \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL No target bitrate has been set, or the first call was
|
||||
* made after the first frame was submitted for
|
||||
* encoding.
|
||||
@ -306,7 +306,7 @@ extern "C" {
|
||||
* \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
|
||||
* inclusive.
|
||||
* \retval 0 Success.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL A target bitrate has already been specified, or the
|
||||
* quality index was not in the range 0...63.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
@ -328,10 +328,50 @@ extern "C" {
|
||||
*
|
||||
* \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
|
||||
* \retval 0 Success.
|
||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL The target bitrate was not positive.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_BITRATE (30)
|
||||
/**Sets the configuration to be compatible with that from the given setup
|
||||
* header.
|
||||
* This sets the Huffman codebooks and quantization parameters to match those
|
||||
* found in the given setup header.
|
||||
* This guarantees that packets encoded by this encoder will be decodable using
|
||||
* a decoder configured with the passed-in setup header.
|
||||
* It does <em>not</em> guarantee that th_encode_flushheader() will produce a
|
||||
* bit-identical setup header, only that they will be compatible.
|
||||
* If you need a bit-identical setup header, then use the one you passed into
|
||||
* this command, and not the one returned by th_encode_flushheader().
|
||||
*
|
||||
* This also does <em>not</em> enable or disable VP3 compatibility; that is not
|
||||
* signaled in the setup header (or anywhere else in the encoded stream), and
|
||||
* is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
|
||||
* If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
|
||||
* and quantization parameters to match the given setup header, you should
|
||||
* enable VP3 compatibility before invoking this command, otherwise the
|
||||
* codebooks and quantization parameters will be reset to the VP3 defaults.
|
||||
*
|
||||
* The current encoder does not support Huffman codebooks which do not contain
|
||||
* codewords for all 32 tokens.
|
||||
* Such codebooks are legal, according to the specification, but cannot be
|
||||
* configured with this function.
|
||||
*
|
||||
* \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
|
||||
* the configuration from.
|
||||
* This should be the original,
|
||||
* undecoded setup header packet,
|
||||
* and <em>not</em> a #th_setup_info
|
||||
* structure filled in by
|
||||
* th_decode_headerin().
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL Encoding has already begun, so the codebooks and
|
||||
* quantization parameters cannot be changed, or the
|
||||
* data in the setup header was not supported by this
|
||||
* encoder.
|
||||
* \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
|
||||
* \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_COMPAT_CONFIG (32)
|
||||
|
||||
/*@}*/
|
||||
|
||||
@ -441,11 +481,25 @@ extern int th_encode_flushheader(th_enc_ctx *_enc,
|
||||
/**Submits an uncompressed frame to the encoder.
|
||||
* \param _enc A #th_enc_ctx handle.
|
||||
* \param _ycbcr A buffer of Y'CbCr data to encode.
|
||||
* If the width and height of the buffer matches the frame size
|
||||
* the encoder was initialized with, the encoder will only
|
||||
* reference the portion inside the picture region.
|
||||
* Any data outside this region will be ignored, and need not map
|
||||
* to a valid address.
|
||||
* Alternatively, you can pass a buffer equal to the size of the
|
||||
* picture region, if this is less than the full frame size.
|
||||
* When using subsampled chroma planes, odd picture sizes or odd
|
||||
* picture offsets may require an unexpected chroma plane size,
|
||||
* and their use is generally discouraged, as they will not be
|
||||
* well-supported by players and other media frameworks.
|
||||
* See Section 4.4 of
|
||||
* <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||
* specification</a> for details if you wish to use them anyway.
|
||||
* \retval 0 Success.
|
||||
* \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL The buffer size does not match the frame size the encoder
|
||||
* was initialized with, or encoding has already
|
||||
* completed.*/
|
||||
* \retval TH_EINVAL The buffer size matches neither the frame size nor the
|
||||
* picture size the encoder was initialized with, or
|
||||
* encoding has already completed.*/
|
||||
extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
|
||||
/**Retrieves encoded video data packets.
|
||||
* This should be called repeatedly after each frame is submitted to flush any
|
||||
|
@ -34,15 +34,15 @@
|
||||
#
|
||||
# ***** END LICENSE BLOCK *****
|
||||
|
||||
DEPTH = ../../..
|
||||
topsrcdir = @top_srcdir@
|
||||
srcdir = @srcdir@
|
||||
DEPTH = ../../..
|
||||
topsrcdir = @top_srcdir@
|
||||
srcdir = @srcdir@
|
||||
|
||||
include $(DEPTH)/config/autoconf.mk
|
||||
|
||||
MODULE = theora
|
||||
LIBRARY_NAME = theora
|
||||
FORCE_STATIC_LIB= 1
|
||||
MODULE = theora
|
||||
LIBRARY_NAME = theora
|
||||
FORCE_STATIC_LIB = 1
|
||||
|
||||
# The encoder is currently not included.
|
||||
DEFINES += -DTHEORA_DISABLE_ENCODE
|
||||
@ -50,51 +50,103 @@ DEFINES += -DTHEORA_DISABLE_ENCODE
|
||||
ifeq ($(findstring 86,$(OS_TEST)), 86)
|
||||
ifneq ($(OS_ARCH),SunOS)
|
||||
ifneq ($(OS_ARCH)$(OS_TEST),WINNTx86_64)
|
||||
DEFINES += -DOC_X86_ASM -DUSE_ASM
|
||||
DEFINES += -DOC_X86_ASM
|
||||
ifeq (64,$(findstring 64,$(OS_TEST)))
|
||||
DEFINES += -DOC_X86_64_ASM
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
VPATH := $(srcdir)
|
||||
VPATH := $(srcdir)
|
||||
|
||||
CSRCS = \
|
||||
apiwrapper.c \
|
||||
bitpack.c \
|
||||
decapiwrapper.c \
|
||||
decinfo.c \
|
||||
decode.c \
|
||||
dequant.c \
|
||||
encoder_disabled.c \
|
||||
fragment.c \
|
||||
huffdec.c \
|
||||
idct.c \
|
||||
info.c \
|
||||
internal.c \
|
||||
quant.c \
|
||||
state.c \
|
||||
$(NULL)
|
||||
CSRCS = \
|
||||
apiwrapper.c \
|
||||
bitpack.c \
|
||||
decapiwrapper.c \
|
||||
decinfo.c \
|
||||
decode.c \
|
||||
dequant.c \
|
||||
fragment.c \
|
||||
huffdec.c \
|
||||
idct.c \
|
||||
info.c \
|
||||
internal.c \
|
||||
quant.c \
|
||||
state.c \
|
||||
$(NULL)
|
||||
|
||||
ifeq ($(findstring 86,$(OS_TEST)), 86)
|
||||
ifdef _MSC_VER
|
||||
ifneq (64,$(findstring 64,$(OS_TEST)))
|
||||
VPATH += $(srcdir)/x86_vc
|
||||
VPATH += $(srcdir)/x86_vc
|
||||
|
||||
CSRCS += \
|
||||
mmxidct.c \
|
||||
mmxfrag.c \
|
||||
mmxstate.c \
|
||||
x86state.c \
|
||||
$(NULL)
|
||||
CSRCS += \
|
||||
mmxidct.c \
|
||||
mmxfrag.c \
|
||||
mmxstate.c \
|
||||
x86state.c \
|
||||
x86cpu.c \
|
||||
$(NULL)
|
||||
endif
|
||||
else
|
||||
VPATH += $(srcdir)/x86
|
||||
VPATH += $(srcdir)/x86
|
||||
|
||||
CSRCS += \
|
||||
mmxidct.c \
|
||||
mmxfrag.c \
|
||||
mmxstate.c \
|
||||
sse2idct.c \
|
||||
x86state.c \
|
||||
x86cpu.c \
|
||||
$(NULL)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef GNU_AS
|
||||
ifeq ($(findstring arm,$(OS_TEST)), arm)
|
||||
|
||||
VPATH += $(srcdir)/arm
|
||||
|
||||
CSRCS += \
|
||||
armcpu.c \
|
||||
armstate.c \
|
||||
$(NULL)
|
||||
|
||||
DEFINES += -DOC_ARM_ASM -DOC_ARM_ASM_EDSP -DOC_ARM_ASM_MEDIA -DOC_ARM_ASM_NEON
|
||||
|
||||
# The Android NDK doesn't pre-define anything to indicate the OS it's on, so
|
||||
# do it for them.
|
||||
ifeq ($(OS_TARGET),Android)
|
||||
DEFINES += -D__linux__
|
||||
endif
|
||||
|
||||
THEORA_ASFILES = \
|
||||
armbits.s \
|
||||
armfrag.s \
|
||||
armidct.s \
|
||||
armloop.s \
|
||||
armopts.s \
|
||||
$(NULL)
|
||||
|
||||
ASFILES = $(patsubst %.s,%-gnu.$(ASM_SUFFIX),$(THEORA_ASFILES))
|
||||
|
||||
# These flags are a lie; they're just used to enable the requisite
|
||||
# opcodes; actual arch detection is done at runtime.
|
||||
ASFLAGS = -march=armv7-a -mfpu=neon
|
||||
|
||||
armfrag-gnu.$(ASM_SUFFIX): armopts-gnu.S
|
||||
armidct-gnu.$(ASM_SUFFIX): armopts-gnu.S
|
||||
armloop-gnu.$(ASM_SUFFIX): armopts-gnu.S
|
||||
|
||||
# armopts needs a specific rule, because arm2gnu.pl will always add the .S
|
||||
# suffix when translating the files that include it.
|
||||
armopts-gnu.S: armopts.s
|
||||
$(PERL) $(srcdir)/arm/arm2gnu.pl < $< > $@
|
||||
# For all others, we can use an implicit rule with the configured $(ASM_SUFFIX).
|
||||
%-gnu.$(ASM_SUFFIX): %.s
|
||||
$(PERL) $(srcdir)/arm/arm2gnu.pl < $< > $@
|
||||
|
||||
CSRCS += \
|
||||
mmxidct.c \
|
||||
mmxfrag.c \
|
||||
mmxstate.c \
|
||||
x86state.c \
|
||||
$(NULL)
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -21,7 +21,7 @@
|
||||
# include <theora/theora.h>
|
||||
# include "theora/theoradec.h"
|
||||
# include "theora/theoraenc.h"
|
||||
# include "internal.h"
|
||||
# include "state.h"
|
||||
|
||||
typedef struct th_api_wrapper th_api_wrapper;
|
||||
typedef struct th_api_info th_api_info;
|
||||
|
271
media/libtheora/lib/arm/arm2gnu.pl
Executable file
271
media/libtheora/lib/arm/arm2gnu.pl
Executable file
@ -0,0 +1,271 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
my $bigend; # little/big endian
|
||||
|
||||
eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
|
||||
if $running_under_some_shell;
|
||||
|
||||
# Parse leading command-line switches (s2p-style option loop).
#   -n            set $nflag, which prevents $printit from being enabled below,
#                 so translated lines are not echoed.
#   --<anything>  stop switch processing.
# Any other switch is a fatal error.
# The @ARGV guard avoids matching against an undefined $ARGV[0] when the
# script is run with no arguments (the normal "< in > out" usage).
while (@ARGV && $ARGV[0] =~ /^-/) {
    $_ = shift;
    last if /^--/;
    if (/^-n/) {
        $nflag++;
        next;
    }
    # End the message with a real newline so die does not append
    # "at <script> line N."; the previous "\\n" emitted a literal
    # backslash followed by 'n'.
    die "I don't recognize this switch: $_\n";
}
|
||||
$printit++ unless $nflag;
|
||||
|
||||
$\ = "\n"; # automatically add newline on print
|
||||
$n=0;
|
||||
|
||||
$thumb = 0; # ARM mode by default, not Thumb.
|
||||
|
||||
LINE:
|
||||
while (<>) {
|
||||
|
||||
# For ADRLs we need to add a new line after the substituted one.
|
||||
$addPadding = 0;
|
||||
|
||||
# First, we do not dare to touch *anything* inside double quotes, do we?
|
||||
# Second, if you want a dollar character in the string,
|
||||
# insert two of them -- that's how ARM C and assembler treat strings.
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next };
|
||||
s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next };
|
||||
s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next };
|
||||
# If there's nothing on a line but a comment, don't try to apply any further
|
||||
# substitutions (this is a cheap hack to avoid mucking up the license header)
|
||||
s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next };
|
||||
# If substituted -- leave immediately !
|
||||
|
||||
s/@/,:/;
|
||||
s/;/@/;
|
||||
while ( /@.*'/ ) {
|
||||
s/(@.*)'/$1/g;
|
||||
}
|
||||
s/\{FALSE\}/0/g;
|
||||
s/\{TRUE\}/1/g;
|
||||
s/\{(\w\w\w\w+)\}/$1/g;
|
||||
s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
|
||||
s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
|
||||
s/\bIMPORT\b/.extern/;
|
||||
s/\bEXPORT\b/.global/;
|
||||
s/^(\s+)\[/$1IF/;
|
||||
s/^(\s+)\|/$1ELSE/;
|
||||
s/^(\s+)\]/$1ENDIF/;
|
||||
s/IF *:DEF:/ .ifdef/;
|
||||
s/IF *:LNOT: *:DEF:/ .ifndef/;
|
||||
s/ELSE/ .else/;
|
||||
s/ENDIF/ .endif/;
|
||||
|
||||
if( /\bIF\b/ ) {
|
||||
s/\bIF\b/ .if/;
|
||||
s/=/==/;
|
||||
}
|
||||
if ( $n == 2) {
|
||||
s/\$/\\/g;
|
||||
}
|
||||
if ($n == 1) {
|
||||
s/\$//g;
|
||||
s/label//g;
|
||||
$n = 2;
|
||||
}
|
||||
if ( /MACRO/ ) {
|
||||
s/MACRO *\n/.macro/;
|
||||
$n=1;
|
||||
}
|
||||
if ( /\bMEND\b/ ) {
|
||||
s/\bMEND\b/.endm/;
|
||||
$n=0;
|
||||
}
|
||||
|
||||
# ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
|
||||
#
|
||||
if ( /\bAREA\b/ ) {
|
||||
s/^(.+)CODE(.+)READONLY(.*)/ .text/;
|
||||
s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata\n .align 2/;
|
||||
s/^(.+)\|\|\.data\|\|(.+)/ .data\n .align 2/;
|
||||
s/^(.+)\|\|\.bss\|\|(.+)/ .bss/;
|
||||
}
|
||||
|
||||
s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3||
|
||||
s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2||
|
||||
s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2||
|
||||
s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
|
||||
s/^(\s+)\%(\s)/ .space $1/;
|
||||
|
||||
s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123
|
||||
s/\bCODE32\b/.code 32/ && do {$thumb = 0};
|
||||
s/\bCODE16\b/.code 16/ && do {$thumb = 1};
|
||||
if (/\bPROC\b/)
|
||||
{
|
||||
print " .thumb_func" if ($thumb);
|
||||
s/\bPROC\b/@ $&/;
|
||||
}
|
||||
s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
|
||||
s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
|
||||
s/\bENDP\b/@ $&/;
|
||||
s/\bSUBT\b/@ $&/;
|
||||
s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25
|
||||
s/\bKEEP\b/@ $&/;
|
||||
s/\bEXPORTAS\b/@ $&/;
|
||||
s/\|\|(.)+\bEQU\b/@ $&/;
|
||||
s/\|\|([\w\$]+)\|\|/$1/;
|
||||
s/\bENTRY\b/@ $&/;
|
||||
s/\bASSERT\b/@ $&/;
|
||||
s/\bGBLL\b/@ $&/;
|
||||
s/\bGBLA\b/@ $&/;
|
||||
s/^\W+OPT\b/@ $&/;
|
||||
s/:OR:/|/g;
|
||||
s/:SHL:/<</g;
|
||||
s/:SHR:/>>/g;
|
||||
s/:AND:/&/g;
|
||||
s/:LAND:/&&/g;
|
||||
s/CPSR/cpsr/;
|
||||
s/SPSR/spsr/;
|
||||
s/ALIGN$/.balign 4/;
|
||||
s/ALIGN\s+([0-9x]+)$/.balign $1/;
|
||||
s/psr_cxsf/psr_all/;
|
||||
s/LTORG/.ltorg/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
|
||||
|
||||
# {PC} + 0xdeadfeed --> . + 0xdeadfeed
|
||||
s/\{PC\} \+/ \. +/;
|
||||
|
||||
# Single hex constant on the line !
|
||||
#
|
||||
# >>> NOTE <<<
|
||||
# Double-precision floats in gcc are always mixed-endian, which means
|
||||
# bytes in two words are little-endian, but words are big-endian.
|
||||
# So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
|
||||
# and 0xfeed0000 at high address.
|
||||
#
|
||||
s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
|
||||
# Only decimal constants on the line, no hex !
|
||||
s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
|
||||
|
||||
# Single hex constant on the line !
|
||||
# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
|
||||
# Only decimal constants on the line, no hex !
|
||||
# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
|
||||
s/\bDCFS[ \t]+0x/.word 0x/;
|
||||
s/\bDCFS\b/.float/;
|
||||
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
|
||||
s/\bDCD\b/.word/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
|
||||
s/\bDCW\b/.short/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
|
||||
s/\bDCB\b/.byte/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
|
||||
s/^[A-Za-z_\.]\w+/$&:/;
|
||||
s/^(\d+)/$1:/;
|
||||
s/\%(\d+)/$1b_or_f/;
|
||||
s/\%[Bb](\d+)/$1b/;
|
||||
s/\%[Ff](\d+)/$1f/;
|
||||
s/\%[Ff][Tt](\d+)/$1f/;
|
||||
s/&([\dA-Fa-f]+)/0x$1/;
|
||||
if ( /\b2_[01]+\b/ ) {
|
||||
s/\b2_([01]+)\b/conv$1&&&&/g;
|
||||
while ( /[01][01][01][01]&&&&/ ) {
|
||||
s/0000&&&&/&&&&0/g;
|
||||
s/0001&&&&/&&&&1/g;
|
||||
s/0010&&&&/&&&&2/g;
|
||||
s/0011&&&&/&&&&3/g;
|
||||
s/0100&&&&/&&&&4/g;
|
||||
s/0101&&&&/&&&&5/g;
|
||||
s/0110&&&&/&&&&6/g;
|
||||
s/0111&&&&/&&&&7/g;
|
||||
s/1000&&&&/&&&&8/g;
|
||||
s/1001&&&&/&&&&9/g;
|
||||
s/1010&&&&/&&&&A/g;
|
||||
s/1011&&&&/&&&&B/g;
|
||||
s/1100&&&&/&&&&C/g;
|
||||
s/1101&&&&/&&&&D/g;
|
||||
s/1110&&&&/&&&&E/g;
|
||||
s/1111&&&&/&&&&F/g;
|
||||
}
|
||||
s/000&&&&/&&&&0/g;
|
||||
s/001&&&&/&&&&1/g;
|
||||
s/010&&&&/&&&&2/g;
|
||||
s/011&&&&/&&&&3/g;
|
||||
s/100&&&&/&&&&4/g;
|
||||
s/101&&&&/&&&&5/g;
|
||||
s/110&&&&/&&&&6/g;
|
||||
s/111&&&&/&&&&7/g;
|
||||
s/00&&&&/&&&&0/g;
|
||||
s/01&&&&/&&&&1/g;
|
||||
s/10&&&&/&&&&2/g;
|
||||
s/11&&&&/&&&&3/g;
|
||||
s/0&&&&/&&&&0/g;
|
||||
s/1&&&&/&&&&1/g;
|
||||
s/conv&&&&/0x/g;
|
||||
}
|
||||
|
||||
if ( /commandline/)
|
||||
{
|
||||
if( /-bigend/)
|
||||
{
|
||||
$bigend=1;
|
||||
}
|
||||
}
|
||||
|
||||
# Translate a DCDU directive carrying a hex constant into four explicit
# .byte directives, written in the byte order selected by $bigend.
# The original directive is left in the output line as a comment ("@ ...").
# $bigend is only set when an earlier input line mentions "commandline"
# together with "-bigend"; otherwise little-endian order is used.
if ( /\bDCDU\b/ )
{
    my $cmd = $_;

    # Comment out the source directive; the bytes are printed separately.
    s/\s+DCDU\b/@ $&/;

    # Capture the full hexadecimal operand. (Matching with \d+ stopped at
    # the first a-f digit and silently truncated values such as 0xdeadbeef.)
    if ( $cmd =~ /\bDCDU\b\s+0x([0-9A-Fa-f]+)/ )
    {
        my $value = $1;

        # Left-pad short constants to a full 32-bit word so the split into
        # four bytes below always succeeds.
        $value = ("0" x (8 - length($value))) . $value if length($value) < 8;

        my ($w1, $w2, $w3, $w4) = $value =~ /^(\w\w)(\w\w)(\w\w)(\w\w)/;

        if ( $bigend )
        {
            # big endian: most significant byte first
            print " .byte 0x".$w1;
            print " .byte 0x".$w2;
            print " .byte 0x".$w3;
            print " .byte 0x".$w4;
        }
        else
        {
            # little endian: least significant byte first
            print " .byte 0x".$w4;
            print " .byte 0x".$w3;
            print " .byte 0x".$w2;
            print " .byte 0x".$w1;
        }
    }
    else
    {
        # Non-hex operands are not handled by this translation; report the
        # line instead of emitting bytes built from stale match variables.
        warn "arm2gnu.pl: cannot translate DCDU operand: $cmd";
    }
}
|
||||
|
||||
|
||||
if ( /\badrl\b/i )
|
||||
{
|
||||
s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
|
||||
$addPadding = 1;
|
||||
}
|
||||
s/\bEND\b/@ END/;
|
||||
} continue {
|
||||
printf ("%s", $_) if $printit;
|
||||
if ($addPadding != 0)
|
||||
{
|
||||
printf (" mov r0,r0\n");
|
||||
$addPadding = 0;
|
||||
}
|
||||
}
|
||||
|
32
media/libtheora/lib/arm/armbits.h
Normal file
32
media/libtheora/lib/arm/armbits.h
Normal file
@ -0,0 +1,32 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
/*Include guard for the ARM replacements of the bit-packer primitives.*/
#if !defined(_arm_armbits_H)
|
||||
# define _arm_armbits_H (1)
|
||||
# include "../bitpack.h"
|
||||
# include "armcpu.h"
|
||||
|
||||
/*When ARM assembly is enabled, redirect the generic bit-packer names to the
   ARM-specific entry points declared below.*/
# if defined(OC_ARM_ASM)
|
||||
# define oc_pack_read oc_pack_read_arm
|
||||
# define oc_pack_read1 oc_pack_read1_arm
|
||||
# define oc_huff_token_decode oc_huff_token_decode_arm
|
||||
# endif
|
||||
|
||||
/*ARM entry points.
  Note that these are declared whether or not OC_ARM_ASM is defined.*/
/*Reads _bits bits from the bit window of _b.*/
long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
|
||||
/*Reads a single bit from the bit window of _b.*/
int oc_pack_read1_arm(oc_pack_buf *_b);
|
||||
/*Decodes one Huffman token from _b by walking the lookup tree _tree.*/
int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
|
||||
|
||||
#endif
|
230
media/libtheora/lib/arm/armbits.s
Normal file
230
media/libtheora/lib/arm/armbits.s
Normal file
@ -0,0 +1,230 @@
|
||||
;********************************************************************
|
||||
;* *
|
||||
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
;* *
|
||||
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
;* *
|
||||
;********************************************************************
|
||||
;
|
||||
; function:
|
||||
; last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
|
||||
;
|
||||
;********************************************************************
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
EXPORT oc_pack_read_arm
|
||||
EXPORT oc_pack_read1_arm
|
||||
EXPORT oc_huff_token_decode_arm
|
||||
|
||||
; int oc_pack_read1_arm(oc_pack_buf *_b)
; Reads one bit from the top of the bit window.
; The code below is the fast path, taken when at least one bit is available.
; Otherwise control transfers to oc_pack_read1_refill, a label that lives
;  inside oc_pack_read_arm; that code refills the window and returns to the
;  caller directly.
oc_pack_read1_arm PROC
|
||||
; r0 = oc_pack_buf *_b
|
||||
ADD r12,r0,#8 ; r12 = address of the {window,available} pair
|
||||
LDMIA r12,{r2,r3} ; r2 = window
|
||||
; Stall... ; r3 = available
|
||||
; Stall...
|
||||
SUBS r3,r3,#1 ; r3 = available-1, available<1 => LT
|
||||
BLT oc_pack_read1_refill
|
||||
MOV r0,r2,LSR #31 ; r0 = window>>31
|
||||
MOV r2,r2,LSL #1 ; r2 = window<<=1
|
||||
STMIA r12,{r2,r3} ; window = r2
|
||||
; available = r3
|
||||
MOV PC,r14 ; Return the bit in r0.
|
||||
ENDP
|
||||
|
||||
; long oc_pack_read_arm(oc_pack_buf *_b,int _bits)
; Reads _bits bits from the top of the 32-bit bit window and returns them in
;  r0.
; As accessed by this code, the oc_pack_buf fields are laid out as:
;  [_b,#0] = stop, [_b,#4] = ptr, [_b,#8] = window, [_b,#12] = available,
;  [_b,#16] = eof.
; The fast path handles the case where the window already holds _bits bits.
; oc_pack_read1_refill is the slow-path entry point used by oc_pack_read1_arm;
;  it simply sets _bits to 1 and falls through into the common refill code.
oc_pack_read_arm PROC
	; r0 = oc_pack_buf *_b
	; r1 = int          _bits
	ADD	r12,r0,#8		; r12 = &window
	LDMIA	r12,{r2,r3}		; r2 = window
	; Stall...			; r3 = available
	; Stall...
	SUBS	r3,r3,r1		; r3 = available-_bits, available<_bits => LT
	BLT	oc_pack_read_refill
	RSB	r0,r1,#32		; r0 = 32-_bits
	MOV	r0,r2,LSR r0		; r0 = window>>32-_bits
	MOV	r2,r2,LSL r1		; r2 = window<<=_bits
	STMIA	r12,{r2,r3}		; window = r2
					; available = r3
	MOV	PC,r14

; We need to refill window.
oc_pack_read1_refill
	MOV	r1,#1
oc_pack_read_refill
	STMFD	r13!,{r10,r11,r14}
	LDMIA	r0,{r10,r11}		; r10 = stop
					; r11 = ptr
	RSB	r0,r1,#32		; r0 = 32-_bits
	RSB	r3,r3,r0		; r3 = 32-available
; We can use unsigned compares for both the pointers and for available
;  (allowing us to chain condition codes) because available will never be
;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
;  negative.
; Each of the four stanzas below loads one more byte into the window, as long
;  as there is input left and room for a whole byte.
	CMP	r10,r11			; ptr<stop => HI
	CMPHI	r3,#7			;   available<=24 => HI
	LDRHIB	r14,[r11],#1		;     r14 = *ptr++
	SUBHI	r3,#8			;     available += 8
	; (HI) Stall...
	ORRHI	r2,r14,LSL r3		;     r2 = window|=r14<<32-available
	CMPHI	r10,r11			;     ptr<stop => HI
	CMPHI	r3,#7			;       available<=24 => HI
	LDRHIB	r14,[r11],#1		;         r14 = *ptr++
	SUBHI	r3,#8			;         available += 8
	; (HI) Stall...
	ORRHI	r2,r14,LSL r3		;         r2 = window|=r14<<32-available
	CMPHI	r10,r11			;         ptr<stop => HI
	CMPHI	r3,#7			;           available<=24 => HI
	LDRHIB	r14,[r11],#1		;             r14 = *ptr++
	SUBHI	r3,#8			;             available += 8
	; (HI) Stall...
	ORRHI	r2,r14,LSL r3		;             r2 = window|=r14<<32-available
	CMPHI	r10,r11			;             ptr<stop => HI
	CMPHI	r3,#7			;               available<=24 => HI
	LDRHIB	r14,[r11],#1		;                 r14 = *ptr++
	SUBHI	r3,#8			;                 available += 8
	; (HI) Stall...
	ORRHI	r2,r14,LSL r3		;                 r2 = window|=r14<<32-available
	SUBS	r3,r0,r3		; r3 = available-=_bits, available<_bits => LT
	BLT	oc_pack_read_refill_last
	MOV	r0,r2,LSR r0		; r0 = window>>32-_bits
	MOV	r2,r2,LSL r1		; r2 = window<<=_bits
	STR	r11,[r12,#-4]		; ptr = r11
	STMIA	r12,{r2,r3}		; window = r2
					; available = r3
	LDMFD	r13!,{r10,r11,PC}

; Either we wanted to read more than 24 bits and didn't have enough room to
;  stuff the last byte into the window, or we hit the end of the packet.
oc_pack_read_refill_last
	CMP	r11,r10			; ptr<stop => LO
; If we didn't hit the end of the packet, then pull enough of the next byte
;  to fill up the window.
	LDRLOB	r14,[r11]		; (LO) r14 = *ptr
; Otherwise, set the EOF flag and pretend we have lots of available bits.
	MOVHS	r14,#1			; (HS) r14 = 1
	ADDLO	r10,r3,r1		; (LO) r10 = available
	STRHS	r14,[r12,#8]		; (HS) eof = 1
	ANDLO	r10,r10,#7		; (LO) r10 = available&7
	MOVHS	r3,#1<<30		; (HS) available = OC_LOTS_OF_BITS
; Only the top 8-(available&7) bits of the next byte fit below the bits that
;  are already in the window, so the byte must be shifted RIGHT.
; A left shift would OR it on top of valid window bits whenever available is
;  not a multiple of 8.
	ORRLO	r2,r14,LSR r10		; (LO) r2 = window|=*ptr>>(available&7)
	MOV	r0,r2,LSR r0		; r0 = window>>32-_bits
	MOV	r2,r2,LSL r1		; r2 = window<<=_bits
	STR	r11,[r12,#-4]		; ptr = r11
	STMIA	r12,{r2,r3}		; window = r2
					; available = r3
	LDMFD	r13!,{r10,r11,PC}
	ENDP
|
||||
|
||||
|
||||
|
||||
oc_huff_token_decode_arm PROC
|
||||
; r0 = oc_pack_buf *_b
|
||||
; r1 = const ogg_int16_t *_tree
|
||||
STMFD r13!,{r4,r5,r10,r14}
|
||||
LDRSH r10,[r1] ; r10 = n=_tree[0]
|
||||
LDMIA r0,{r2-r5} ; r2 = stop
|
||||
; Stall... ; r3 = ptr
|
||||
; Stall... ; r4 = window
|
||||
; r5 = available
|
||||
CMP r10,r5 ; n>available => GT
|
||||
BGT oc_huff_token_decode_refill0
|
||||
RSB r14,r10,#32 ; r14 = 32-n
|
||||
MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
|
||||
ADD r14,r1,r14,LSL #1 ; r14 = _tree+bits
|
||||
LDRSH r12,[r14,#2] ; r12 = node=_tree[1+bits]
|
||||
; Stall...
|
||||
; Stall...
|
||||
RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
|
||||
BMI oc_huff_token_decode_continue
|
||||
MOV r10,r14,LSR #8 ; r10 = n=node>>8
|
||||
MOV r4,r4,LSL r10 ; r4 = window<<=n
|
||||
SUB r5,r10 ; r5 = available-=n
|
||||
STMIB r0,{r3-r5} ; ptr = r3
|
||||
; window = r4
|
||||
; available = r5
|
||||
AND r0,r14,#255 ; r0 = node&255
|
||||
LDMFD r13!,{r4,r5,r10,pc}
|
||||
|
||||
; The first tree node wasn't enough to reach a leaf, read another
|
||||
oc_huff_token_decode_continue
|
||||
ADD r12,r1,r12,LSL #1 ; r12 = _tree+node
|
||||
MOV r4,r4,LSL r10 ; r4 = window<<=n
|
||||
SUB r5,r5,r10 ; r5 = available-=n
|
||||
LDRSH r10,[r12],#2 ; r10 = n=_tree[node]
|
||||
; Stall... ; r12 = _tree+node+1
|
||||
; Stall...
|
||||
CMP r10,r5 ; n>available => GT
|
||||
BGT oc_huff_token_decode_refill
|
||||
RSB r14,r10,#32 ; r14 = 32-n
|
||||
MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
|
||||
ADD r12,r12,r14 ;
|
||||
LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits]
|
||||
; Stall...
|
||||
; Stall...
|
||||
RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
|
||||
BMI oc_huff_token_decode_continue
|
||||
MOV r10,r14,LSR #8 ; r10 = n=node>>8
|
||||
MOV r4,r4,LSL r10 ; r4 = window<<=n
|
||||
SUB r5,r10 ; r5 = available-=n
|
||||
STMIB r0,{r3-r5} ; ptr = r3
|
||||
; window = r4
|
||||
; available = r5
|
||||
AND r0,r14,#255 ; r0 = node&255
|
||||
LDMFD r13!,{r4,r5,r10,pc}
|
||||
|
||||
oc_huff_token_decode_refill0
|
||||
ADD r12,r1,#2 ; r12 = _tree+1
|
||||
oc_huff_token_decode_refill
|
||||
; We can't possibly need more than 15 bits, so available must be <= 15.
|
||||
; Therefore we can load at least two bytes without checking it.
|
||||
CMP r2,r3 ; ptr<stop => HI
|
||||
LDRHIB r14,[r3],#1 ; r14 = *ptr++
|
||||
RSBHI r5,r5,#24 ; (HI) available = 32-(available+=8)
|
||||
RSBLS r5,r5,#32 ; (LS) r5 = 32-available
|
||||
ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
|
||||
CMPHI r2,r3 ; ptr<stop => HI
|
||||
LDRHIB r14,[r3],#1 ; r14 = *ptr++
|
||||
SUBHI r5,#8 ; available += 8
|
||||
; (HI) Stall...
|
||||
ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
|
||||
; We can use unsigned compares for both the pointers and for available
|
||||
; (allowing us to chain condition codes) because available will never be
|
||||
; larger than 32 (or we wouldn't be here), and thus 32-available will never be
|
||||
; negative.
|
||||
CMPHI r2,r3 ; ptr<stop => HI
|
||||
CMPHI r5,#7 ; available<=24 => HI
|
||||
LDRHIB r14,[r3],#1 ; r14 = *ptr++
|
||||
SUBHI r5,#8 ; available += 8
|
||||
; (HI) Stall...
|
||||
ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
|
||||
CMP r2,r3 ; ptr<stop => HI
|
||||
MOVLS r5,#-1<<30 ; (LS) available = OC_LOTS_OF_BITS+32
|
||||
CMPHI r5,#7 ; (HI) available<=24 => HI
|
||||
LDRHIB r14,[r3],#1 ; (HI) r14 = *ptr++
|
||||
SUBHI r5,#8 ; (HI) available += 8
|
||||
; (HI) Stall...
|
||||
ORRHI r4,r14,LSL r5 ; (HI) r4 = window|=r14<<32-available
|
||||
RSB r14,r10,#32 ; r14 = 32-n
|
||||
MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
|
||||
ADD r12,r12,r14 ;
|
||||
LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits]
|
||||
RSB r5,r5,#32 ; r5 = available
|
||||
; Stall...
|
||||
RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
|
||||
BMI oc_huff_token_decode_continue
|
||||
MOV r10,r14,LSR #8 ; r10 = n=node>>8
|
||||
MOV r4,r4,LSL r10 ; r4 = window<<=n
|
||||
SUB r5,r10 ; r5 = available-=n
|
||||
STMIB r0,{r3-r5} ; ptr = r3
|
||||
; window = r4
|
||||
; available = r5
|
||||
AND r0,r14,#255 ; r0 = node&255
|
||||
LDMFD r13!,{r4,r5,r10,pc}
|
||||
ENDP
|
||||
|
||||
END
|
116
media/libtheora/lib/arm/armcpu.c
Normal file
116
media/libtheora/lib/arm/armcpu.c
Normal file
@ -0,0 +1,116 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
CPU capability detection for ARM processors.
|
||||
|
||||
function:
|
||||
last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "armcpu.h"
|
||||
|
||||
#if !defined(OC_ARM_ASM)|| \
|
||||
!defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_ARMV6)&& \
|
||||
!defined(OC_ARM_ASM_NEON)
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(_MSC_VER)
|
||||
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# define WIN32_EXTRA_LEAN
|
||||
# include <windows.h>
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t flags;
|
||||
flags=0;
|
||||
/*MSVC has no inline __asm support for ARM, but it does let you __emit
|
||||
instructions via their assembled hex code.
|
||||
All of these instructions should be essentially nops.*/
|
||||
# if defined(OC_ARM_ASM_EDSP)
|
||||
__try{
|
||||
/*PLD [r13]*/
|
||||
__emit(0xF5DDF000);
|
||||
flags|=OC_CPU_ARM_EDSP;
|
||||
}
|
||||
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
# if defined(OC_ARM_ASM_MEDIA)
|
||||
__try{
|
||||
/*SHADD8 r3,r3,r3*/
|
||||
__emit(0xE6333F93);
|
||||
flags|=OC_CPU_ARM_MEDIA;
|
||||
}
|
||||
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
# if defined(OC_ARM_ASM_NEON)
|
||||
__try{
|
||||
/*VORR q0,q0,q0*/
|
||||
__emit(0xF2200150);
|
||||
flags|=OC_CPU_ARM_NEON;
|
||||
}
|
||||
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
return flags;
|
||||
}
|
||||
|
||||
#elif defined(__linux__)
|
||||
# include <stdio.h>
|
||||
# include <stdlib.h>
|
||||
# include <string.h>
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t flags;
|
||||
FILE *fin;
|
||||
flags=0;
|
||||
/*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
|
||||
Android.
|
||||
This also means that detection will fail in Scratchbox.*/
|
||||
fin=fopen("/proc/cpuinfo","r");
|
||||
if(fin!=NULL){
|
||||
/*512 should be enough for anybody (it's even enough for all the flags that
|
||||
x86 has accumulated... so far).*/
|
||||
char buf[512];
|
||||
while(fgets(buf,511,fin)!=NULL){
|
||||
if(memcmp(buf,"Features",8)==0){
|
||||
char *p;
|
||||
p=strstr(buf," edsp");
|
||||
if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
|
||||
p=strstr(buf," neon");
|
||||
if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
|
||||
}
|
||||
if(memcmp(buf,"CPU architecture:",17)==0){
|
||||
int version;
|
||||
version=atoi(buf+17);
|
||||
if(version>=6)flags|=OC_CPU_ARM_MEDIA;
|
||||
}
|
||||
}
|
||||
fclose(fin);
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
#else
|
||||
/*The feature registers which can tell us what the processor supports are
|
||||
accessible in priveleged modes only, so we can't have a general user-space
|
||||
detection method like on x86.*/
|
||||
# error "Configured to use ARM asm but no CPU detection method available for " \
|
||||
"your platform. Reconfigure with --disable-asm (or send patches)."
|
||||
#endif
|
29
media/libtheora/lib/arm/armcpu.h
Normal file
29
media/libtheora/lib/arm/armcpu.h
Normal file
@ -0,0 +1,29 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
function:
|
||||
last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*Include guard for the ARM CPU feature flags and their detection routine.*/
#if !defined(_arm_armcpu_H)
|
||||
# define _arm_armcpu_H (1)
|
||||
#include "../internal.h"
|
||||
|
||||
/*"Parallel instructions" from ARM v6 and above.*/
|
||||
#define OC_CPU_ARM_MEDIA (1<<24)
|
||||
/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
|
||||
#define OC_CPU_ARM_EDSP (1<<7)
|
||||
#define OC_CPU_ARM_NEON (1<<12)
|
||||
|
||||
/*Returns a bitmask built from the OC_CPU_ARM_* flags above.*/
ogg_uint32_t oc_cpu_flags_get(void);
|
||||
|
||||
#endif
|
656
media/libtheora/lib/arm/armfrag.s
Normal file
656
media/libtheora/lib/arm/armfrag.s
Normal file
@ -0,0 +1,656 @@
|
||||
;********************************************************************
|
||||
;* *
|
||||
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
;* *
|
||||
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
;* *
|
||||
;********************************************************************
|
||||
; Original implementation:
|
||||
; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
|
||||
; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
|
||||
;********************************************************************
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
GET armopts.s
|
||||
|
||||
; Vanilla ARM v4 versions
|
||||
EXPORT oc_frag_copy_list_arm
|
||||
EXPORT oc_frag_recon_intra_arm
|
||||
EXPORT oc_frag_recon_inter_arm
|
||||
EXPORT oc_frag_recon_inter2_arm
|
||||
|
||||
; oc_frag_copy_list_arm:
; For each of the _nfragis fragment indices in _fragis, copies an 8x8 byte
;  block (8 rows of two words, _ystride bytes apart) from _src_frame to
;  _dst_frame at byte offset _frag_buf_offs[_fragis[fragii]].
; Does nothing (and reads nothing from _fragis) when _nfragis<=0.
oc_frag_copy_list_arm PROC
	; r0 = _dst_frame
	; r1 = _src_frame
	; r2 = _ystride
	; r3 = _fragis
	; <> = _nfragis
	; <> = _frag_buf_offs
	LDR	r12,[r13]		; r12 = _nfragis
	STMFD	r13!,{r4-r6,r11,r14}
	SUBS	r12, r12, #1
	; Only touch _fragis when the list is non-empty, as the EDSP and NEON
	;  versions of this routine do.
	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
	; Five registers were pushed, so the second stack argument is now at
	;  r13+4*5+4.
	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
	BLT	ofcl_arm_end
	SUB	r2, r2, #4		; Each row's first access advances by 4.
ofcl_arm_lp
	LDR	r11,[r14,r4,LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
	SUBS	r12, r12, #1		; Flags stay live until the BGE below.
	; Stall (on XScale)
	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
	LDR	r6, [r4], #4
	ADD	r11,r0, r11		; r11 = _dst_frame+frag_buf_off
	LDR	r5, [r4], r2
	STR	r6, [r11],#4
	LDR	r6, [r4], #4
	STR	r5, [r11],r2
	LDR	r5, [r4], r2
	STR	r6, [r11],#4
	LDR	r6, [r4], #4
	STR	r5, [r11],r2
	LDR	r5, [r4], r2
	STR	r6, [r11],#4
	LDR	r6, [r4], #4
	STR	r5, [r11],r2
	LDR	r5, [r4], r2
	STR	r6, [r11],#4
	LDR	r6, [r4], #4
	STR	r5, [r11],r2
	LDR	r5, [r4], r2
	STR	r6, [r11],#4
	LDR	r6, [r4], #4
	STR	r5, [r11],r2
	LDR	r5, [r4], r2
	STR	r6, [r11],#4
	LDR	r6, [r4], #4
	STR	r5, [r11],r2
	LDR	r5, [r4], r2
	STR	r6, [r11],#4
	LDR	r6, [r4], #4
	STR	r5, [r11],r2
	LDR	r5, [r4]
	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
	STR	r6, [r11],#4
	STR	r5, [r11]
	BGE	ofcl_arm_lp
ofcl_arm_end
	LDMFD	r13!,{r4-r6,r11,PC}
	ENDP

; oc_frag_recon_intra_arm:
; Converts an 8x8 block of 16-bit residue values to pixels: each output byte
;  is _residue[i]+128 clamped to [0,255].
; Rows of _dst are _ystride bytes apart.
oc_frag_recon_intra_arm PROC
	; r0 =       unsigned char *_dst
	; r1 =       int            _ystride
	; r2 = const ogg_int16_t    _residue[64]
	STMFD	r13!,{r4,r5,r14}
	MOV	r14,#8			; r14 = rows remaining
	MOV	r5, #255
	SUB	r1, r1, #7		; The first 7 stores of a row advance by 1.
ofrintra_lp_arm
	; Clamp idiom used for every pixel below:
	;  ADDS  x,x,#128   x<=0 leaves GT false (and LT true iff x<0)
	;  CMPGT r5,x       for x>0: 255<x => LT
	;  EORLT x,r5,x,ASR #32
	;                   x<0:   255^-1 has a zero low byte, so 0 is stored
	;                   x>255: 255^0 = 255
	LDRSH	r3, [r2], #2
	LDRSH	r4, [r2], #2
	LDRSH	r12,[r2], #2
	ADDS	r3, r3, #128
	CMPGT	r5, r3
	EORLT	r3, r5, r3, ASR #32
	STRB	r3, [r0], #1
	ADDS	r4, r4, #128
	CMPGT	r5, r4
	EORLT	r4, r5, r4, ASR #32
	LDRSH	r3, [r2], #2
	STRB	r4, [r0], #1
	ADDS	r12,r12,#128
	CMPGT	r5, r12
	EORLT	r12,r5, r12,ASR #32
	LDRSH	r4, [r2], #2
	STRB	r12,[r0], #1
	ADDS	r3, r3, #128
	CMPGT	r5, r3
	EORLT	r3, r5, r3, ASR #32
	LDRSH	r12,[r2], #2
	STRB	r3, [r0], #1
	ADDS	r4, r4, #128
	CMPGT	r5, r4
	EORLT	r4, r5, r4, ASR #32
	LDRSH	r3, [r2], #2
	STRB	r4, [r0], #1
	ADDS	r12,r12,#128
	CMPGT	r5, r12
	EORLT	r12,r5, r12,ASR #32
	LDRSH	r4, [r2], #2
	STRB	r12,[r0], #1
	ADDS	r3, r3, #128
	CMPGT	r5, r3
	EORLT	r3, r5, r3, ASR #32
	STRB	r3, [r0], #1
	ADDS	r4, r4, #128
	CMPGT	r5, r4
	EORLT	r4, r5, r4, ASR #32
	STRB	r4, [r0], r1		; Last pixel: step to the next row.
	SUBS	r14,r14,#1
	BGT	ofrintra_lp_arm
	LDMFD	r13!,{r4,r5,PC}
	ENDP
|
||||
|
||||
oc_frag_recon_inter_arm PROC
|
||||
; r0 = unsigned char *dst
|
||||
; r1 = const unsigned char *src
|
||||
; r2 = int ystride
|
||||
; r3 = const ogg_int16_t residue[64]
|
||||
STMFD r13!,{r5,r9-r11,r14}
|
||||
MOV r9, #8
|
||||
MOV r5, #255
|
||||
SUB r2, r2, #7
|
||||
ofrinter_lp_arm
|
||||
LDRSH r12,[r3], #2
|
||||
LDRB r14,[r1], #1
|
||||
LDRSH r11,[r3], #2
|
||||
LDRB r10,[r1], #1
|
||||
ADDS r12,r12,r14
|
||||
CMPGT r5, r12
|
||||
EORLT r12,r5, r12,ASR #32
|
||||
STRB r12,[r0], #1
|
||||
ADDS r11,r11,r10
|
||||
CMPGT r5, r11
|
||||
LDRSH r12,[r3], #2
|
||||
LDRB r14,[r1], #1
|
||||
EORLT r11,r5, r11,ASR #32
|
||||
STRB r11,[r0], #1
|
||||
ADDS r12,r12,r14
|
||||
CMPGT r5, r12
|
||||
LDRSH r11,[r3], #2
|
||||
LDRB r10,[r1], #1
|
||||
EORLT r12,r5, r12,ASR #32
|
||||
STRB r12,[r0], #1
|
||||
ADDS r11,r11,r10
|
||||
CMPGT r5, r11
|
||||
LDRSH r12,[r3], #2
|
||||
LDRB r14,[r1], #1
|
||||
EORLT r11,r5, r11,ASR #32
|
||||
STRB r11,[r0], #1
|
||||
ADDS r12,r12,r14
|
||||
CMPGT r5, r12
|
||||
LDRSH r11,[r3], #2
|
||||
LDRB r10,[r1], #1
|
||||
EORLT r12,r5, r12,ASR #32
|
||||
STRB r12,[r0], #1
|
||||
ADDS r11,r11,r10
|
||||
CMPGT r5, r11
|
||||
LDRSH r12,[r3], #2
|
||||
LDRB r14,[r1], #1
|
||||
EORLT r11,r5, r11,ASR #32
|
||||
STRB r11,[r0], #1
|
||||
ADDS r12,r12,r14
|
||||
CMPGT r5, r12
|
||||
LDRSH r11,[r3], #2
|
||||
LDRB r10,[r1], r2
|
||||
EORLT r12,r5, r12,ASR #32
|
||||
STRB r12,[r0], #1
|
||||
ADDS r11,r11,r10
|
||||
CMPGT r5, r11
|
||||
EORLT r11,r5, r11,ASR #32
|
||||
STRB r11,[r0], r2
|
||||
SUBS r9, r9, #1
|
||||
BGT ofrinter_lp_arm
|
||||
LDMFD r13!,{r5,r9-r11,PC}
|
||||
ENDP
|
||||
|
||||
oc_frag_recon_inter2_arm PROC
|
||||
; r0 = unsigned char *dst
|
||||
; r1 = const unsigned char *src1
|
||||
; r2 = const unsigned char *src2
|
||||
; r3 = int ystride
|
||||
LDR r12,[r13]
|
||||
; r12= const ogg_int16_t residue[64]
|
||||
STMFD r13!,{r4-r8,r14}
|
||||
MOV r14,#8
|
||||
MOV r8, #255
|
||||
SUB r3, r3, #7
|
||||
ofrinter2_lp_arm
|
||||
LDRB r5, [r1], #1
|
||||
LDRB r6, [r2], #1
|
||||
LDRSH r4, [r12],#2
|
||||
LDRB r7, [r1], #1
|
||||
ADD r5, r5, r6
|
||||
ADDS r5, r4, r5, LSR #1
|
||||
CMPGT r8, r5
|
||||
LDRB r6, [r2], #1
|
||||
LDRSH r4, [r12],#2
|
||||
EORLT r5, r8, r5, ASR #32
|
||||
STRB r5, [r0], #1
|
||||
ADD r7, r7, r6
|
||||
ADDS r7, r4, r7, LSR #1
|
||||
CMPGT r8, r7
|
||||
LDRB r5, [r1], #1
|
||||
LDRB r6, [r2], #1
|
||||
LDRSH r4, [r12],#2
|
||||
EORLT r7, r8, r7, ASR #32
|
||||
STRB r7, [r0], #1
|
||||
ADD r5, r5, r6
|
||||
ADDS r5, r4, r5, LSR #1
|
||||
CMPGT r8, r5
|
||||
LDRB r7, [r1], #1
|
||||
LDRB r6, [r2], #1
|
||||
LDRSH r4, [r12],#2
|
||||
EORLT r5, r8, r5, ASR #32
|
||||
STRB r5, [r0], #1
|
||||
ADD r7, r7, r6
|
||||
ADDS r7, r4, r7, LSR #1
|
||||
CMPGT r8, r7
|
||||
LDRB r5, [r1], #1
|
||||
LDRB r6, [r2], #1
|
||||
LDRSH r4, [r12],#2
|
||||
EORLT r7, r8, r7, ASR #32
|
||||
STRB r7, [r0], #1
|
||||
ADD r5, r5, r6
|
||||
ADDS r5, r4, r5, LSR #1
|
||||
CMPGT r8, r5
|
||||
LDRB r7, [r1], #1
|
||||
LDRB r6, [r2], #1
|
||||
LDRSH r4, [r12],#2
|
||||
EORLT r5, r8, r5, ASR #32
|
||||
STRB r5, [r0], #1
|
||||
ADD r7, r7, r6
|
||||
ADDS r7, r4, r7, LSR #1
|
||||
CMPGT r8, r7
|
||||
LDRB r5, [r1], #1
|
||||
LDRB r6, [r2], #1
|
||||
LDRSH r4, [r12],#2
|
||||
EORLT r7, r8, r7, ASR #32
|
||||
STRB r7, [r0], #1
|
||||
ADD r5, r5, r6
|
||||
ADDS r5, r4, r5, LSR #1
|
||||
CMPGT r8, r5
|
||||
LDRB r7, [r1], r3
|
||||
LDRB r6, [r2], r3
|
||||
LDRSH r4, [r12],#2
|
||||
EORLT r5, r8, r5, ASR #32
|
||||
STRB r5, [r0], #1
|
||||
ADD r7, r7, r6
|
||||
ADDS r7, r4, r7, LSR #1
|
||||
CMPGT r8, r7
|
||||
EORLT r7, r8, r7, ASR #32
|
||||
STRB r7, [r0], r3
|
||||
SUBS r14,r14,#1
|
||||
BGT ofrinter2_lp_arm
|
||||
LDMFD r13!,{r4-r8,PC}
|
||||
ENDP
|
||||
|
||||
[ OC_ARM_ASM_EDSP
|
||||
EXPORT oc_frag_copy_list_edsp
|
||||
|
||||
oc_frag_copy_list_edsp PROC
|
||||
; r0 = _dst_frame
|
||||
; r1 = _src_frame
|
||||
; r2 = _ystride
|
||||
; r3 = _fragis
|
||||
; <> = _nfragis
|
||||
; <> = _frag_buf_offs
|
||||
LDR r12,[r13] ; r12 = _nfragis
|
||||
STMFD r13!,{r4-r11,r14}
|
||||
SUBS r12, r12, #1
|
||||
LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
|
||||
LDRGE r14,[r13,#4*10] ; r14 = _frag_buf_offs
|
||||
BLT ofcl_edsp_end
|
||||
ofcl_edsp_lp
|
||||
MOV r4, r1
|
||||
LDR r5, [r14,r5, LSL #2] ; r5 = _frag_buf_offs[_fragis[fragii]]
|
||||
SUBS r12, r12, #1
|
||||
; Stall (on XScale)
|
||||
LDRD r6, [r4, r5]! ; r4 = _src_frame+frag_buf_off
|
||||
LDRD r8, [r4, r2]!
|
||||
; Stall
|
||||
STRD r6, [r5, r0]! ; r5 = _dst_frame+frag_buf_off
|
||||
STRD r8, [r5, r2]!
|
||||
; Stall
|
||||
LDRD r6, [r4, r2]! ; On Xscale at least, doing 3 consecutive
|
||||
LDRD r8, [r4, r2]! ; loads causes a stall, but that's no worse
|
||||
LDRD r10,[r4, r2]! ; than us only doing 2, and having to do
|
||||
; another pair of LDRD/STRD later on.
|
||||
; Stall
|
||||
STRD r6, [r5, r2]!
|
||||
STRD r8, [r5, r2]!
|
||||
STRD r10,[r5, r2]!
|
||||
LDRD r6, [r4, r2]!
|
||||
LDRD r8, [r4, r2]!
|
||||
LDRD r10,[r4, r2]!
|
||||
STRD r6, [r5, r2]!
|
||||
STRD r8, [r5, r2]!
|
||||
STRD r10,[r5, r2]!
|
||||
LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
|
||||
BGE ofcl_edsp_lp
|
||||
ofcl_edsp_end
|
||||
LDMFD r13!,{r4-r11,PC}
|
||||
ENDP
|
||||
]
|
||||
|
||||
[ OC_ARM_ASM_MEDIA
|
||||
EXPORT oc_frag_recon_intra_v6
|
||||
EXPORT oc_frag_recon_inter_v6
|
||||
EXPORT oc_frag_recon_inter2_v6
|
||||
|
||||
oc_frag_recon_intra_v6 PROC
|
||||
; r0 = unsigned char *_dst
|
||||
; r1 = int _ystride
|
||||
; r2 = const ogg_int16_t _residue[64]
|
||||
STMFD r13!,{r4-r6,r14}
|
||||
MOV r14,#8
|
||||
MOV r12,r2
|
||||
LDR r6, =0x00800080
|
||||
ofrintra_v6_lp
|
||||
LDRD r2, [r12],#8 ; r2 = 11110000 r3 = 33332222
|
||||
LDRD r4, [r12],#8 ; r4 = 55554444 r5 = 77776666
|
||||
SUBS r14,r14,#1
|
||||
QADD16 r2, r2, r6
|
||||
QADD16 r3, r3, r6
|
||||
QADD16 r4, r4, r6
|
||||
QADD16 r5, r5, r6
|
||||
USAT16 r2, #8, r2 ; r2 = __11__00
|
||||
USAT16 r3, #8, r3 ; r3 = __33__22
|
||||
USAT16 r4, #8, r4 ; r4 = __55__44
|
||||
USAT16 r5, #8, r5 ; r5 = __77__66
|
||||
ORR r2, r2, r2, LSR #8 ; r2 = __111100
|
||||
ORR r3, r3, r3, LSR #8 ; r3 = __333322
|
||||
ORR r4, r4, r4, LSR #8 ; r4 = __555544
|
||||
ORR r5, r5, r5, LSR #8 ; r5 = __777766
|
||||
PKHBT r2, r2, r3, LSL #16 ; r2 = 33221100
|
||||
PKHBT r3, r4, r5, LSL #16 ; r3 = 77665544
|
||||
STRD r2, [r0], r1
|
||||
BGT ofrintra_v6_lp
|
||||
LDMFD r13!,{r4-r6,PC}
|
||||
ENDP
|
||||
|
||||
; oc_frag_recon_inter_v6:
; Adds an 8x8 block of 16-bit residue values to the 8x8 block of pixels at
;  _src and stores the sums, saturated to 8 bits, at _dst.
; One 8-pixel row is processed per loop iteration; r14 counts the 8 rows.
oc_frag_recon_inter_v6 PROC
|
||||
; r0 = unsigned char *_dst
|
||||
; r1 = const unsigned char *_src
|
||||
; r2 = int _ystride
|
||||
; r3 = const ogg_int16_t _residue[64]
|
||||
STMFD r13!,{r4-r7,r14}
|
||||
MOV r14,#8
|
||||
ofrinter_v6_lp
|
||||
LDRD r6, [r3], #8 ; r6 = 11110000 r7 = 33332222
|
||||
SUBS r14,r14,#1 ; Flags stay live until the BGT at the bottom.
|
||||
; Load the 8 source pixels of this row, with a single LDRD if unaligned
;  doubleword loads are permitted, and with two word loads otherwise.
[ OC_ARM_CAN_UNALIGN_LDRD
|
||||
LDRD r4, [r1], r2 ; Unaligned ; r4 = 33221100 r5 = 77665544
|
||||
|
|
||||
LDR r5, [r1, #4]
|
||||
LDR r4, [r1], r2
|
||||
]
|
||||
PKHBT r12,r6, r7, LSL #16 ; r12= 22220000
|
||||
PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
|
||||
UXTB16 r6,r4 ; r6 = __22__00
|
||||
UXTB16 r4,r4, ROR #8 ; r4 = __33__11
|
||||
QADD16 r12,r12,r6 ; r12= xx22xx00
|
||||
QADD16 r4, r7, r4 ; r4 = xx33xx11
|
||||
LDRD r6, [r3], #8 ; r6 = 55554444 r7 = 77776666
|
||||
USAT16 r4, #8, r4 ; r4 = __33__11
|
||||
USAT16 r12,#8,r12 ; r12= __22__00
|
||||
ORR r4, r12,r4, LSL #8 ; r4 = 33221100
|
||||
PKHBT r12,r6, r7, LSL #16 ; r12= 66664444
|
||||
PKHTB r7, r7, r6, ASR #16 ; r7 = 77775555
|
||||
UXTB16 r6,r5 ; r6 = __66__44
|
||||
UXTB16 r5,r5, ROR #8 ; r5 = __77__55
|
||||
QADD16 r12,r12,r6 ; r12= xx66xx44
|
||||
QADD16 r5, r7, r5 ; r5 = xx77xx55
|
||||
USAT16 r12,#8, r12 ; r12= __66__44
|
||||
USAT16 r5, #8, r5 ; r5 = __77__55
|
||||
ORR r5, r12,r5, LSL #8 ; r5 = 77665544
|
||||
STRD r4, [r0], r2 ; Store the row and step _dst to the next one.
|
||||
BGT ofrinter_v6_lp
|
||||
LDMFD r13!,{r4-r7,PC}
|
||||
ENDP
|
||||
|
||||
; oc_frag_recon_inter2_v6:
; Averages the 8x8 pixel blocks at _src1 and _src2 (UHADD8: per-byte
;  (a+b)>>1), adds the 16-bit residue, and stores the sums, saturated to
;  8 bits, at _dst.
; One 8-pixel row is processed per loop iteration; r14 counts the 8 rows.
; The upper four pixels of each row are computed first, then the lower four.
oc_frag_recon_inter2_v6 PROC
|
||||
; r0 = unsigned char *_dst
|
||||
; r1 = const unsigned char *_src1
|
||||
; r2 = const unsigned char *_src2
|
||||
; r3 = int _ystride
|
||||
LDR r12,[r13]
|
||||
; r12= const ogg_int16_t _residue[64]
|
||||
STMFD r13!,{r4-r9,r14}
|
||||
MOV r14,#8
|
||||
ofrinter2_v6_lp
|
||||
LDRD r6, [r12,#8] ; r6 = 55554444 r7 = 77776666
|
||||
SUBS r14,r14,#1 ; Flags stay live until the BGT at the bottom.
|
||||
LDR r4, [r1, #4] ; Unaligned ; r4 = src1[1] = 77665544
|
||||
LDR r5, [r2, #4] ; Unaligned ; r5 = src2[1] = 77665544
|
||||
PKHBT r8, r6, r7, LSL #16 ; r8 = 66664444
|
||||
PKHTB r9, r7, r6, ASR #16 ; r9 = 77775555
|
||||
UHADD8 r4, r4, r5 ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
|
||||
UXTB16 r5, r4 ; r5 = __66__44
|
||||
UXTB16 r4, r4, ROR #8 ; r4 = __77__55
|
||||
QADD16 r8, r8, r5 ; r8 = xx66xx44
|
||||
QADD16 r9, r9, r4 ; r9 = xx77xx55
|
||||
LDRD r6,[r12],#16 ; r6 = 11110000 r7 = 33332222
|
||||
USAT16 r8, #8, r8 ; r8 = __66__44
|
||||
LDR r4, [r1], r3 ; Unaligned ; r4 = src1[0] = 33221100
|
||||
USAT16 r9, #8, r9 ; r9 = __77__55
|
||||
LDR r5, [r2], r3 ; Unaligned ; r5 = src2[0] = 33221100
|
||||
ORR r9, r8, r9, LSL #8 ; r9 = 77665544
|
||||
PKHBT r8, r6, r7, LSL #16 ; r8 = 22220000
|
||||
UHADD8 r4, r4, r5 ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
|
||||
PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
|
||||
UXTB16 r5, r4 ; r5 = __22__00
|
||||
UXTB16 r4, r4, ROR #8 ; r4 = __33__11
|
||||
QADD16 r8, r8, r5 ; r8 = xx22xx00
|
||||
QADD16 r7, r7, r4 ; r7 = xx33xx11
|
||||
USAT16 r8, #8, r8 ; r8 = __22__00
|
||||
USAT16 r7, #8, r7 ; r7 = __33__11
|
||||
ORR r8, r8, r7, LSL #8 ; r8 = 33221100
|
||||
STRD r8, [r0], r3 ; Store r8,r9 and step _dst to the next row.
|
||||
BGT ofrinter2_v6_lp
|
||||
LDMFD r13!,{r4-r9,PC}
|
||||
ENDP
|
||||
]
|
||||
|
||||
[ OC_ARM_ASM_NEON
|
||||
EXPORT oc_frag_copy_list_neon
|
||||
EXPORT oc_frag_recon_intra_neon
|
||||
EXPORT oc_frag_recon_inter_neon
|
||||
EXPORT oc_frag_recon_inter2_neon
|
||||
|
||||
oc_frag_copy_list_neon PROC
|
||||
; r0 = _dst_frame
|
||||
; r1 = _src_frame
|
||||
; r2 = _ystride
|
||||
; r3 = _fragis
|
||||
; <> = _nfragis
|
||||
; <> = _frag_buf_offs
|
||||
LDR r12,[r13] ; r12 = _nfragis
|
||||
STMFD r13!,{r4-r7,r14}
|
||||
CMP r12, #1
|
||||
LDRGE r6, [r3] ; r6 = _fragis[fragii]
|
||||
LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs
|
||||
BLT ofcl_neon_end
|
||||
; Stall (2 on Xscale)
|
||||
LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
|
||||
; Stall (on XScale)
|
||||
MOV r7, r6 ; Guarantee PLD points somewhere valid.
|
||||
ofcl_neon_lp
|
||||
ADD r4, r1, r6
|
||||
VLD1.64 {D0}, [r4@64], r2
|
||||
ADD r5, r0, r6
|
||||
VLD1.64 {D1}, [r4@64], r2
|
||||
SUBS r12, r12, #1
|
||||
VLD1.64 {D2}, [r4@64], r2
|
||||
LDRGT r6, [r3,#4]! ; r6 = _fragis[fragii]
|
||||
VLD1.64 {D3}, [r4@64], r2
|
||||
LDRGT r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
|
||||
VLD1.64 {D4}, [r4@64], r2
|
||||
ADDGT r7, r1, r6
|
||||
VLD1.64 {D5}, [r4@64], r2
|
||||
PLD [r7]
|
||||
VLD1.64 {D6}, [r4@64], r2
|
||||
PLD [r7, r2]
|
||||
VLD1.64 {D7}, [r4@64]
|
||||
PLD [r7, r2, LSL #1]
|
||||
VST1.64 {D0}, [r5@64], r2
|
||||
ADDGT r7, r7, r2, LSL #2
|
||||
VST1.64 {D1}, [r5@64], r2
|
||||
PLD [r7, -r2]
|
||||
VST1.64 {D2}, [r5@64], r2
|
||||
PLD [r7]
|
||||
VST1.64 {D3}, [r5@64], r2
|
||||
PLD [r7, r2]
|
||||
VST1.64 {D4}, [r5@64], r2
|
||||
PLD [r7, r2, LSL #1]
|
||||
VST1.64 {D5}, [r5@64], r2
|
||||
ADDGT r7, r7, r2, LSL #2
|
||||
VST1.64 {D6}, [r5@64], r2
|
||||
PLD [r7, -r2]
|
||||
VST1.64 {D7}, [r5@64]
|
||||
BGT ofcl_neon_lp
|
||||
ofcl_neon_end
|
||||
LDMFD r13!,{r4-r7,PC}
|
||||
ENDP
|
||||
|
||||
oc_frag_recon_intra_neon PROC ; Intra recon: writes the 8x8 block clamp(_residue+128) to _dst
|
||||
; r0 = unsigned char *_dst (every row is stored with a 64-bit alignment hint)
|
||||
; r1 = int _ystride
|
||||
; r2 = const ogg_int16_t _residue[64]
|
||||
MOV r3, #128 ; r3 = 128, the bias added to every residue
|
||||
VDUP.S16 Q0, r3 ; Q0 = 128 in each 16-bit lane
|
||||
VLDMIA r2, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
|
||||
VQADD.S16 Q8, Q8, Q0 ; Q8..Q15 = residue rows 0..7 + 128 (signed saturating)
|
||||
VQADD.S16 Q9, Q9, Q0
|
||||
VQADD.S16 Q10,Q10,Q0
|
||||
VQADD.S16 Q11,Q11,Q0
|
||||
VQADD.S16 Q12,Q12,Q0
|
||||
VQADD.S16 Q13,Q13,Q0
|
||||
VQADD.S16 Q14,Q14,Q0
|
||||
VQADD.S16 Q15,Q15,Q0
|
||||
VQMOVUN.S16 D16,Q8 ; D16= 7766554433221100 ; 1 cycle
|
||||
VQMOVUN.S16 D17,Q9 ; D17= FFEEDDCCBBAA9988 ; 1 cycle
|
||||
VQMOVUN.S16 D18,Q10 ; D18= NNMMLLKKJJIIHHGG ; 1 cycle
|
||||
VST1.64 {D16},[r0@64], r1 ; stores are interleaved with the remaining narrows
|
||||
VQMOVUN.S16 D19,Q11 ; D19= VVUUTTSSRRQQPPOO ; 1 cycle
|
||||
VST1.64 {D17},[r0@64], r1
|
||||
VQMOVUN.S16 D20,Q12 ; D20= ddccbbaaZZYYXXWW ; 1 cycle
|
||||
VST1.64 {D18},[r0@64], r1
|
||||
VQMOVUN.S16 D21,Q13 ; D21= llkkjjiihhggffee ; 1 cycle
|
||||
VST1.64 {D19},[r0@64], r1
|
||||
VQMOVUN.S16 D22,Q14 ; D22= ttssrrqqppoonnmm ; 1 cycle
|
||||
VST1.64 {D20},[r0@64], r1
|
||||
VQMOVUN.S16 D23,Q15 ; D23= !!@@zzyyxxwwvvuu ; 1 cycle
|
||||
VST1.64 {D21},[r0@64], r1
|
||||
VST1.64 {D22},[r0@64], r1
|
||||
VST1.64 {D23},[r0@64], r1 ; last row; r0 ends 8 rows past _dst
|
||||
MOV PC,R14 ; return
|
||||
ENDP
|
||||
|
||||
oc_frag_recon_inter_neon PROC ; Inter recon: writes the 8x8 block clamp(_src+_residue) to _dst
|
||||
; r0 = unsigned char *_dst (stored with a 64-bit alignment hint)
|
||||
; r1 = const unsigned char *_src (loaded without an alignment hint)
|
||||
; r2 = int _ystride (used to step both _src and _dst)
|
||||
; r3 = const ogg_int16_t _residue[64]
|
||||
VLDMIA r3, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
|
||||
VLD1.64 {D0}, [r1], r2 ; src rows 0-3 go to D0, D2, D4, D6
|
||||
VLD1.64 {D2}, [r1], r2
|
||||
VMOVL.U8 Q0, D0 ; Q0 = __77__66__55__44__33__22__11__00
|
||||
VLD1.64 {D4}, [r1], r2
|
||||
VMOVL.U8 Q1, D2 ; etc
|
||||
VLD1.64 {D6}, [r1], r2
|
||||
VMOVL.U8 Q2, D4
|
||||
VMOVL.U8 Q3, D6
|
||||
VQADD.S16 Q8, Q8, Q0 ; row 0: residue + widened src (signed saturating)
|
||||
VLD1.64 {D0}, [r1], r2 ; src rows 4-7 reuse D0, D2, D4, D6
|
||||
VQADD.S16 Q9, Q9, Q1
|
||||
VLD1.64 {D2}, [r1], r2
|
||||
VQADD.S16 Q10,Q10,Q2
|
||||
VLD1.64 {D4}, [r1], r2
|
||||
VQADD.S16 Q11,Q11,Q3
|
||||
VLD1.64 {D6}, [r1], r2
|
||||
VMOVL.U8 Q0, D0
|
||||
VMOVL.U8 Q1, D2
|
||||
VMOVL.U8 Q2, D4
|
||||
VMOVL.U8 Q3, D6
|
||||
VQADD.S16 Q12,Q12,Q0
|
||||
VQADD.S16 Q13,Q13,Q1
|
||||
VQADD.S16 Q14,Q14,Q2
|
||||
VQADD.S16 Q15,Q15,Q3
|
||||
VQMOVUN.S16 D16,Q8 ; narrow each S16 to U8 with unsigned saturation
|
||||
VQMOVUN.S16 D17,Q9
|
||||
VQMOVUN.S16 D18,Q10
|
||||
VST1.64 {D16},[r0@64], r2
|
||||
VQMOVUN.S16 D19,Q11
|
||||
VST1.64 {D17},[r0@64], r2
|
||||
VQMOVUN.S16 D20,Q12
|
||||
VST1.64 {D18},[r0@64], r2
|
||||
VQMOVUN.S16 D21,Q13
|
||||
VST1.64 {D19},[r0@64], r2
|
||||
VQMOVUN.S16 D22,Q14
|
||||
VST1.64 {D20},[r0@64], r2
|
||||
VQMOVUN.S16 D23,Q15
|
||||
VST1.64 {D21},[r0@64], r2
|
||||
VST1.64 {D22},[r0@64], r2
|
||||
VST1.64 {D23},[r0@64], r2 ; last row; r0 ends 8 rows past _dst
|
||||
MOV PC,R14 ; return
|
||||
ENDP
|
||||
|
||||
oc_frag_recon_inter2_neon PROC ; Inter recon from two predictors: clamp(((_src1+_src2)>>1)+_residue)
|
||||
; r0 = unsigned char *_dst (stored with a 64-bit alignment hint)
|
||||
; r1 = const unsigned char *_src1
|
||||
; r2 = const unsigned char *_src2
|
||||
; r3 = int _ystride (used to step _src1, _src2 and _dst)
|
||||
LDR r12,[r13] ; fifth argument, read from the stack
|
||||
; r12= const ogg_int16_t _residue[64]
|
||||
VLDMIA r12,{D16-D31}
|
||||
VLD1.64 {D0}, [r1], r3 ; Q0 = src1 rows 0-1, Q2 = src2 rows 0-1
|
||||
VLD1.64 {D4}, [r2], r3
|
||||
VLD1.64 {D1}, [r1], r3
|
||||
VLD1.64 {D5}, [r2], r3
|
||||
VHADD.U8 Q2, Q0, Q2 ; Q2 = FFEEDDCCBBAA99887766554433221100
|
||||
VLD1.64 {D2}, [r1], r3 ; Q1 = src1 rows 2-3, Q3 = src2 rows 2-3
|
||||
VLD1.64 {D6}, [r2], r3
|
||||
VMOVL.U8 Q0, D4 ; Q0 = __77__66__55__44__33__22__11__00
|
||||
VLD1.64 {D3}, [r1], r3
|
||||
VMOVL.U8 Q2, D5 ; etc
|
||||
VLD1.64 {D7}, [r2], r3
|
||||
VHADD.U8 Q3, Q1, Q3 ; truncating average of rows 2-3
|
||||
VQADD.S16 Q8, Q8, Q0 ; row 0: residue + averaged predictor (signed saturating)
|
||||
VQADD.S16 Q9, Q9, Q2
|
||||
VLD1.64 {D0}, [r1], r3 ; rows 4-7 repeat the same pattern
|
||||
VMOVL.U8 Q1, D6
|
||||
VLD1.64 {D4}, [r2], r3
|
||||
VMOVL.U8 Q3, D7
|
||||
VLD1.64 {D1}, [r1], r3
|
||||
VQADD.S16 Q10,Q10,Q1
|
||||
VLD1.64 {D5}, [r2], r3
|
||||
VQADD.S16 Q11,Q11,Q3
|
||||
VLD1.64 {D2}, [r1], r3
|
||||
VHADD.U8 Q2, Q0, Q2
|
||||
VLD1.64 {D6}, [r2], r3
|
||||
VLD1.64 {D3}, [r1], r3
|
||||
VMOVL.U8 Q0, D4
|
||||
VLD1.64 {D7}, [r2], r3
|
||||
VMOVL.U8 Q2, D5
|
||||
VHADD.U8 Q3, Q1, Q3
|
||||
VQADD.S16 Q12,Q12,Q0
|
||||
VQADD.S16 Q13,Q13,Q2
|
||||
VMOVL.U8 Q1, D6
|
||||
VMOVL.U8 Q3, D7
|
||||
VQADD.S16 Q14,Q14,Q1
|
||||
VQADD.S16 Q15,Q15,Q3
|
||||
VQMOVUN.S16 D16,Q8 ; narrow each S16 to U8 with unsigned saturation
|
||||
VQMOVUN.S16 D17,Q9
|
||||
VQMOVUN.S16 D18,Q10
|
||||
VST1.64 {D16},[r0@64], r3
|
||||
VQMOVUN.S16 D19,Q11
|
||||
VST1.64 {D17},[r0@64], r3
|
||||
VQMOVUN.S16 D20,Q12
|
||||
VST1.64 {D18},[r0@64], r3
|
||||
VQMOVUN.S16 D21,Q13
|
||||
VST1.64 {D19},[r0@64], r3
|
||||
VQMOVUN.S16 D22,Q14
|
||||
VST1.64 {D20},[r0@64], r3
|
||||
VQMOVUN.S16 D23,Q15
|
||||
VST1.64 {D21},[r0@64], r3
|
||||
VST1.64 {D22},[r0@64], r3
|
||||
VST1.64 {D23},[r0@64], r3 ; last row; r0 ends 8 rows past _dst
|
||||
MOV PC,R14 ; return
|
||||
ENDP
|
||||
]
|
||||
|
||||
END
|
1908
media/libtheora/lib/arm/armidct.s
Normal file
1908
media/libtheora/lib/arm/armidct.s
Normal file
File diff suppressed because it is too large
Load Diff
126
media/libtheora/lib/arm/armint.h
Normal file
126
media/libtheora/lib/arm/armint.h
Normal file
@ -0,0 +1,126 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_arm_armint_H)
|
||||
# define _arm_armint_H (1)
|
||||
# include "../internal.h"
|
||||
|
||||
# if defined(OC_ARM_ASM)
|
||||
|
||||
# if defined(__ARMEB__)
|
||||
# error "Big-endian configurations are not supported by the ARM asm. " \
|
||||
"Reconfigure with --disable-asm or undefine OC_ARM_ASM."
|
||||
# endif /*defined(__ARMEB__)*/
|
||||
|
||||
# define oc_state_accel_init oc_state_accel_init_arm
|
||||
/*The loop filter is implemented entirely in asm, so it's helpful to pull out all
|
||||
of the things that depend on structure offsets and pass them as arguments.
|
||||
We reuse the vtable function pointer by casting it to the ARM prototype.*/
|
||||
# define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
|
||||
_fragy0,_fragy_end) \
|
||||
((oc_loop_filter_frag_rows_arm_func) \
|
||||
(_state)->opt_vtable.state_loop_filter_frag_rows)( \
|
||||
(_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
|
||||
(_bv), \
|
||||
(_state)->frags, \
|
||||
(_state)->fplanes[(_pli)].froffset \
|
||||
+(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
|
||||
(_state)->fplanes[(_pli)].froffset \
|
||||
+(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
|
||||
(_state)->fplanes[(_pli)].froffset, \
|
||||
(_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
|
||||
(_state)->frag_buf_offs, \
|
||||
(_state)->fplanes[(_pli)].nhfrags)
|
||||
/*For everything else the default vtable macros are fine.*/
|
||||
# define OC_STATE_USE_VTABLE (1)
|
||||
# endif /*defined(OC_ARM_ASM)*/
|
||||
|
||||
# include "../state.h"
|
||||
# include "armcpu.h"
|
||||
|
||||
# if defined(OC_ARM_ASM)
|
||||
typedef void (*oc_loop_filter_frag_rows_arm_func)( /*Prototype the macro above casts the vtable entry to.*/
|
||||
unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
|
||||
const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
|
||||
ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
|
||||
const ptrdiff_t *_frag_buf_offs,int _nhfrags);
|
||||
|
||||
void oc_state_accel_init_arm(oc_theora_state *_state);
|
||||
void oc_frag_copy_list_arm(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
|
||||
int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
|
||||
void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
|
||||
int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
|
||||
ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
|
||||
const ptrdiff_t *_frag_buf_offs,int _nhfrags);
|
||||
|
||||
# if defined(OC_ARM_ASM_EDSP) /*The MEDIA and NEON sections below nest inside this one.*/
|
||||
void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
|
||||
# if defined(OC_ARM_ASM_MEDIA)
|
||||
void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
|
||||
int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
|
||||
void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
|
||||
void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
|
||||
int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
|
||||
ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
|
||||
const ptrdiff_t *_frag_buf_offs,int _nhfrags);
|
||||
|
||||
# if defined(OC_ARM_ASM_NEON)
|
||||
void oc_frag_copy_list_neon(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
|
||||
int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
|
||||
void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
|
||||
void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
|
||||
int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
|
||||
ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
|
||||
const ptrdiff_t *_frag_buf_offs,int _nhfrags);
|
||||
# endif /*defined(OC_ARM_ASM_NEON)*/
|
||||
# endif /*defined(OC_ARM_ASM_MEDIA)*/
|
||||
# endif /*defined(OC_ARM_ASM_EDSP)*/
|
||||
# endif /*defined(OC_ARM_ASM)*/
|
||||
|
||||
#endif /*!defined(_arm_armint_H)*/
|
676
media/libtheora/lib/arm/armloop.s
Normal file
676
media/libtheora/lib/arm/armloop.s
Normal file
@ -0,0 +1,676 @@
|
||||
;********************************************************************
|
||||
;* *
|
||||
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
;* *
|
||||
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
;* *
|
||||
;********************************************************************
|
||||
; Original implementation:
|
||||
; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
|
||||
; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
|
||||
;********************************************************************
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
GET armopts.s
|
||||
|
||||
EXPORT oc_loop_filter_frag_rows_arm
|
||||
|
||||
; Which bit this is depends on the order of packing within a bitfield.
|
||||
; Hopefully that doesn't change among any of the relevant compilers.
|
||||
OC_FRAG_CODED_FLAG * 1
|
||||
|
||||
; Vanilla ARM v4 version
|
||||
loop_filter_h_arm PROC ; Filters the two pixels on either side of _pix in each of 8 rows
|
||||
; r0 = unsigned char *_pix
|
||||
; r1 = int _ystride
|
||||
; r2 = signed char *_bv (byte table, read with LDRSB at a signed index)
|
||||
; preserves r0-r3 (r12 and the flags are clobbered)
|
||||
STMFD r13!,{r3-r6,r14}
|
||||
MOV r14,#8 ; r14= rows left to filter
|
||||
MOV r6, #255 ; r6 = 255, the upper clamp bound
|
||||
lfh_arm_lp
|
||||
LDRB r3, [r0, #-2] ; r3 = _pix[0] (comment indices are relative to r0-2)
|
||||
LDRB r12,[r0, #1] ; r12= _pix[3]
|
||||
LDRB r4, [r0, #-1] ; r4 = _pix[1]
|
||||
LDRB r5, [r0] ; r5 = _pix[2]
|
||||
SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]
|
||||
ADD r3, r3, #4 ; r3 = _pix[0]-_pix[3]+4
|
||||
SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
|
||||
ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
|
||||
ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
|
||||
MOV r12,r12,ASR #3 ; r12= f, the signed table index
|
||||
LDRSB r12,[r2, r12] ; r12= _bv[f], sign-extended
|
||||
; Stall (2 on Xscale)
|
||||
ADDS r4, r4, r12 ; r4 = _pix[1]+_bv[f]
|
||||
CMPGT r6, r4 ; if r4>0, compare it against 255
|
||||
EORLT r4, r6, r4, ASR #32 ; r4<0 -> low byte 0; r4>255 -> 255
|
||||
SUBS r5, r5, r12 ; r5 = _pix[2]-_bv[f]
|
||||
CMPGT r6, r5
|
||||
EORLT r5, r6, r5, ASR #32 ; same clamp for r5
|
||||
STRB r4, [r0, #-1]
|
||||
STRB r5, [r0], r1 ; store, then step down one row
|
||||
SUBS r14,r14,#1
|
||||
BGT lfh_arm_lp
|
||||
SUB r0, r0, r1, LSL #3 ; undo the 8 row steps to restore r0
|
||||
LDMFD r13!,{r3-r6,PC}
|
||||
ENDP
|
||||
|
||||
loop_filter_v_arm PROC ; Filters the two pixels above/below _pix in each of 8 columns
|
||||
; r0 = unsigned char *_pix
|
||||
; r1 = int _ystride
|
||||
; r2 = signed char *_bv (byte table, read with LDRSB at a signed index)
|
||||
; preserves r0-r3 (r12 and the flags are clobbered)
|
||||
STMFD r13!,{r3-r6,r14}
|
||||
MOV r14,#8 ; r14= columns left to filter
|
||||
MOV r6, #255 ; r6 = 255, the upper clamp bound
|
||||
lfv_arm_lp
|
||||
LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0] (comment indices count rows from r0-2*_ystride)
|
||||
LDRB r12,[r0, r1] ; r12= _pix[3]
|
||||
LDRB r4, [r0, -r1] ; r4 = _pix[1]
|
||||
LDRB r5, [r0] ; r5 = _pix[2]
|
||||
SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]
|
||||
ADD r3, r3, #4 ; r3 = _pix[0]-_pix[3]+4
|
||||
SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
|
||||
ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
|
||||
ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
|
||||
MOV r12,r12,ASR #3 ; r12= f, the signed table index
|
||||
LDRSB r12,[r2, r12] ; r12= _bv[f], sign-extended
|
||||
; Stall (2 on Xscale)
|
||||
ADDS r4, r4, r12 ; r4 = _pix[1]+_bv[f]
|
||||
CMPGT r6, r4 ; if r4>0, compare it against 255
|
||||
EORLT r4, r6, r4, ASR #32 ; r4<0 -> low byte 0; r4>255 -> 255
|
||||
SUBS r5, r5, r12 ; r5 = _pix[2]-_bv[f]
|
||||
CMPGT r6, r5
|
||||
EORLT r5, r6, r5, ASR #32 ; same clamp for r5
|
||||
STRB r4, [r0, -r1]
|
||||
STRB r5, [r0], #1 ; store, then step right one column
|
||||
SUBS r14,r14,#1
|
||||
BGT lfv_arm_lp
|
||||
SUB r0, r0, #8 ; undo the 8 column steps to restore r0
|
||||
LDMFD r13!,{r3-r6,PC}
|
||||
ENDP
|
||||
|
||||
oc_loop_filter_frag_rows_arm PROC ; Walks fragments [_fragi0,_fragi0_end) and filters the edges of coded ones
|
||||
; r0 = _ref_frame_data
|
||||
; r1 = _ystride
|
||||
; r2 = _bv
|
||||
; r3 = _frags
|
||||
; r4 = _fragi0 (r4-r9 are stack arguments, loaded below)
|
||||
; r5 = _fragi0_end
|
||||
; r6 = _fragi_top
|
||||
; r7 = _fragi_bot
|
||||
; r8 = _frag_buf_offs
|
||||
; r9 = _nhfrags
|
||||
MOV r12,r13 ; r12= address of the first stack argument
|
||||
STMFD r13!,{r0,r4-r11,r14} ; r0 lands at [r13] and is reloaded per fragment
|
||||
LDMFD r12,{r4-r9} ; load the six stack arguments
|
||||
ADD r2, r2, #127 ; _bv += 127
|
||||
CMP r4, r5 ; if(_fragi0>=_fragi0_end)
|
||||
BGE oslffri_arm_end ; bail
|
||||
SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
|
||||
BLE oslffri_arm_end ; bail
|
||||
ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi] (entries indexed as 4 bytes)
|
||||
ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
|
||||
SUB r7, r7, r9 ; _fragi_bot -= _nhfrags-1 (r9 was decremented above)
|
||||
oslffri_arm_lp1
|
||||
MOV r10,r4 ; r10= fragi = _fragi0
|
||||
ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
|
||||
oslffri_arm_lp2
|
||||
LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
|
||||
LDR r0, [r13] ; r0 = _ref_frame_data
|
||||
LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
|
||||
TST r14,#OC_FRAG_CODED_FLAG
|
||||
BEQ oslffri_arm_uncoded
|
||||
CMP r10,r4 ; if (fragi>_fragi0)
|
||||
ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
|
||||
BLGT loop_filter_h_arm
|
||||
CMP r4, r6 ; if (_fragi0>_fragi_top)
|
||||
BLGT loop_filter_v_arm
|
||||
CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
|
||||
LDRLT r12,[r3] ; r12 = _frags[fragi+1]
|
||||
ADD r0, r0, #8 ; r0 = 8 pixels right of the fragment start
|
||||
ADD r10,r10,#1 ; r10 = fragi+1;
|
||||
ANDLT r12,r12,#OC_FRAG_CODED_FLAG
|
||||
CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
|
||||
BLLT loop_filter_h_arm
|
||||
CMP r10,r7 ; if (fragi<_fragi_bot)
|
||||
LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
|
||||
SUB r0, r0, #8
|
||||
ADD r0, r0, r1, LSL #3 ; r0 = fragment start + 8 rows
|
||||
ANDLT r12,r12,#OC_FRAG_CODED_FLAG
|
||||
CMPLT r12,#OC_FRAG_CODED_FLAG ; && the fragment one row down is not coded
|
||||
BLLT loop_filter_v_arm
|
||||
CMP r10,r11 ; while(fragi<=fragi_end-1)
|
||||
BLE oslffri_arm_lp2
|
||||
MOV r4, r10 ; r4 = fragi0 += _nhfrags
|
||||
CMP r4, r5
|
||||
BLT oslffri_arm_lp1
|
||||
oslffri_arm_end
|
||||
LDMFD r13!,{r0,r4-r11,PC}
|
||||
oslffri_arm_uncoded
|
||||
ADD r10,r10,#1 ; skip the uncoded fragment (r3 and r8 already advanced)
|
||||
CMP r10,r11
|
||||
BLE oslffri_arm_lp2
|
||||
MOV r4, r10 ; r4 = _fragi0 += _nhfrags
|
||||
CMP r4, r5
|
||||
BLT oslffri_arm_lp1
|
||||
LDMFD r13!,{r0,r4-r11,PC}
|
||||
ENDP
|
||||
|
||||
[ OC_ARM_ASM_MEDIA
|
||||
EXPORT oc_loop_filter_init_v6
|
||||
EXPORT oc_loop_filter_frag_rows_v6
|
||||
|
||||
oc_loop_filter_init_v6 PROC ; Stores ll=255-2*L replicated in four bytes at _bv
|
||||
; r0 = _bv
|
||||
; r1 = _flimit (=L from the spec)
|
||||
MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L>
|
||||
AND r1, r1, #255 ; r1 = ll=r1&0xFF
|
||||
ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll>
|
||||
PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll>
|
||||
STR r1, [r0] ; only the first word at _bv is written
|
||||
MOV PC,r14 ; return
|
||||
ENDP
|
||||
|
||||
; We could use the same strategy as the v filter below, but that would require
|
||||
; 40 instructions to load the data and transpose it into columns and another
|
||||
; 32 to write out the results at the end, plus the 52 instructions to do the
|
||||
; filtering itself.
|
||||
; This is slightly less, and less code, even assuming we could have shared the
|
||||
; 52 instructions in the middle with the other function.
|
||||
; It executes slightly fewer instructions than the ARMv6 approach David Conrad
|
||||
; proposed for FFmpeg, but not by much:
|
||||
; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
|
||||
; His is a lot less code, though, because it only does two rows at once instead
|
||||
; of four.
|
||||
loop_filter_h_v6 PROC ; Runs loop_filter_h_core_v6 twice, 4 rows apart
|
||||
; r0 = unsigned char *_pix
|
||||
; r1 = int _ystride
|
||||
; r2 = int _ll
|
||||
; preserves r0-r3 (r12 is left holding 0x10003)
|
||||
STMFD r13!,{r4-r11,r14}
|
||||
LDR r12,=0x10003 ; r12= <1|3>, the constant the core routine expects
|
||||
BL loop_filter_h_core_v6
|
||||
ADD r0, r0, r1, LSL #2 ; advance 4 rows for the second call
|
||||
BL loop_filter_h_core_v6
|
||||
SUB r0, r0, r1, LSL #2 ; restore r0
|
||||
LDMFD r13!,{r4-r11,PC}
|
||||
ENDP
|
||||
|
||||
loop_filter_h_core_v6 PROC ; Filters four consecutive rows (named p, q, r, s below) at _pix
|
||||
; r0 = unsigned char *_pix
|
||||
; r1 = int _ystride
|
||||
; r2 = int _ll
|
||||
; r12= 0x10003
|
||||
; Preserves r0-r3, r12; Clobbers r4-r11.
|
||||
LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0>
|
||||
; Single issue
|
||||
LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0>
|
||||
UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2>
|
||||
UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1>
|
||||
UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2>
|
||||
UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1>
|
||||
PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1>
|
||||
PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2>
|
||||
SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2>
|
||||
SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3>
|
||||
SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2>
|
||||
SMLAD r7, r7, r12,r12 ; r7 = <????|(q3-q0)+3*(q1-q2)+3>
|
||||
LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0>
|
||||
MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
|
||||
LDR r5,[r0, r1]! ; r5 = <s3|s2|s1|s0>
|
||||
PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p>
|
||||
UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2>
|
||||
UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p>
|
||||
UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1>
|
||||
UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2>
|
||||
PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2>
|
||||
SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2>
|
||||
UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1>
|
||||
SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r1-r2)+3>
|
||||
SSUB16 r7, r5, r7 ; r7 = <s3-s0|s1-s2>
|
||||
SMLAD r7, r7, r12,r12 ; r7 = <????|(s3-s0)+3*(s1-s2)+3>
|
||||
ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2>
|
||||
MOV r6, r6, ASR #3 ; r6 = <??????|(r3-r0)+3*(r1-r2)+3>>3>
|
||||
PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1>
|
||||
PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r>
|
||||
ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1>
|
||||
UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r>
|
||||
MOV r10,#0
|
||||
ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p>
|
||||
; Single issue
|
||||
; There's no min, max or abs instruction.
|
||||
; SSUB8 and SEL will work for abs, and we can do all the rest with
|
||||
; unsigned saturated adds, which means the GE flags are still all
|
||||
; set when we're done computing lflim(abs(R_i),L).
|
||||
; This allows us to both add and subtract, and split the results by
|
||||
; the original sign of R_i.
|
||||
SSUB8 r7, r10,r6 ; r7 = 0-(-R_i) per byte; GE is set where R_i>=0
|
||||
; Single issue
|
||||
SEL r7, r7, r6 ; r7 = abs(R_i)
|
||||
; Single issue
|
||||
UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0)
|
||||
; Single issue
|
||||
UQADD8 r7, r7, r4
|
||||
; Single issue
|
||||
UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
|
||||
; Single issue
|
||||
UQSUB8 r4, r8, r7 ; both the +/- results are formed; SEL picks by sign
|
||||
UQADD8 r5, r9, r7
|
||||
UQADD8 r8, r8, r7
|
||||
UQSUB8 r9, r9, r7
|
||||
SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L)
|
||||
SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L)
|
||||
MOV r5, r9, LSR #24 ; r5 = s2
|
||||
STRB r5, [r0,#2]! ; r0 was left on row s at _pix-2; rows are written s, r, q, p
|
||||
MOV r4, r8, LSR #24 ; r4 = s1
|
||||
STRB r4, [r0,#-1]
|
||||
MOV r5, r9, LSR #8 ; r5 = r2
|
||||
STRB r5, [r0,-r1]!
|
||||
MOV r4, r8, LSR #8 ; r4 = r1
|
||||
STRB r4, [r0,#-1]
|
||||
MOV r5, r9, LSR #16 ; r5 = q2
|
||||
STRB r5, [r0,-r1]!
|
||||
MOV r4, r8, LSR #16 ; r4 = q1
|
||||
STRB r4, [r0,#-1]
|
||||
; Single issue
|
||||
STRB r9, [r0,-r1]! ; low byte = p2; r0 is back at its entry value
|
||||
; Single issue
|
||||
STRB r8, [r0,#-1] ; low byte = p1
|
||||
MOV PC,r14 ; return
|
||||
ENDP
|
||||
|
||||
; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
|
||||
; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
|
||||
; This works just as well, with the following procedure for computing the
|
||||
; filter value, f:
|
||||
; u = ~UHADD8(p1,~p2);
|
||||
; v = UHADD8(~p1,p2);
|
||||
; m = v-u;
|
||||
; a = m^UHADD8(m^p0,m^~p3);
|
||||
; f = UHADD8(UHADD8(a,u1),v1);
|
||||
; where f = 127+R, with R in [-127,128] defined as in the spec.
|
||||
; This is exactly the same amount of arithmetic as the version that uses PAVGB
|
||||
; as the basic operator.
|
||||
; It executes about 2/3 the number of instructions of David Conrad's approach,
|
||||
; but requires more code, because it does all eight columns at once, instead
|
||||
; of four at a time.
|
||||
loop_filter_v_v6 PROC ; Filters the rows above/below _pix for 8 columns, 4 bytes per register
|
||||
; r0 = unsigned char *_pix
|
||||
; r1 = int _ystride
|
||||
; r2 = int _ll
|
||||
; preserves r0-r11 (r12 is clobbered)
|
||||
STMFD r13!,{r4-r11,r14}
|
||||
LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1>
|
||||
LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0>
|
||||
LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2>
|
||||
MVN r14,r6 ; r14= ~p1
|
||||
LDRD r10,[r0, r1] ; r11,r10= <p7|p3>
|
||||
; Filter the first four columns.
|
||||
MVN r12,r8 ; r12= ~p2
|
||||
UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1
|
||||
UHADD8 r12,r12,r6 ; r12= p1+~p2>>1
|
||||
MVN r10, r10 ; r10=~p3
|
||||
MVN r12,r12 ; r12= u1=~p1+p2+1>>1
|
||||
SSUB8 r14,r14,r12 ; r14= m1=v1-u1
|
||||
; Single issue
|
||||
EOR r4, r4, r14 ; r4 = m1^p0
|
||||
EOR r10,r10,r14 ; r10= m1^~p3
|
||||
UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1
|
||||
; Single issue
|
||||
EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
|
||||
SADD8 r14,r14,r12 ; r14= v1=m1+u1
|
||||
UHADD8 r4, r4, r12 ; r4 = a1+u1>>1
|
||||
MVN r12,r9 ; r12= ~p6
|
||||
UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1
|
||||
; Filter the second four columns.
|
||||
MVN r14,r7 ; r14= ~p5
|
||||
UHADD8 r12,r12,r7 ; r12= p5+~p6>>1
|
||||
UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1
|
||||
MVN r12,r12 ; r12= u2=~p5+p6+1>>1
|
||||
MVN r11,r11 ; r11=~p7
|
||||
SSUB8 r10,r14,r12 ; r10= m2=v2-u2
|
||||
; Single issue
|
||||
EOR r5, r5, r10 ; r5 = m2^p4
|
||||
EOR r11,r11,r10 ; r11= m2^~p7
|
||||
UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1
|
||||
; Single issue
|
||||
EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
|
||||
; Single issue
|
||||
UHADD8 r5, r5, r12 ; r5 = a2+u2>>1
|
||||
LDR r12,=0x7F7F7F7F ; r12 = {127}x4
|
||||
UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1
|
||||
; Now split f[i] by sign.
|
||||
; There's no min or max instruction.
|
||||
; We could use SSUB8 and SEL, but this is just as many instructions and
|
||||
; dual issues more (for v7 without NEON).
|
||||
UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0
|
||||
UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0
|
||||
UQADD8 r11,r10,r2 ; r11= 255-max(2*L-max(R_i,0),0)
|
||||
UQADD8 r14,r4, r2 ; r14= 255-max(2*L-max(-R_i,0),0)
|
||||
UQADD8 r10,r10,r11
|
||||
UQADD8 r4, r4, r14
|
||||
UQSUB8 r10,r10,r11 ; r10= min(max(R_i,0),max(2*L-max(R_i,0),0))
|
||||
UQSUB8 r4, r4, r14 ; r4 = min(max(-R_i,0),max(2*L-max(-R_i,0),0))
|
||||
UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0
|
||||
UQADD8 r6, r6, r10
|
||||
UQSUB8 r8, r8, r10
|
||||
UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0
|
||||
UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L)
|
||||
UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L)
|
||||
UQADD8 r10,r11,r2 ; r10= 255-max(2*L-max(R_i,0),0)
|
||||
UQADD8 r14,r5, r2 ; r14= 255-max(2*L-max(-R_i,0),0)
|
||||
UQADD8 r11,r11,r10
|
||||
UQADD8 r5, r5, r14
|
||||
UQSUB8 r11,r11,r10 ; r11= min(max(R_i,0),max(2*L-max(R_i,0),0))
|
||||
UQSUB8 r5, r5, r14 ; r5 = min(max(-R_i,0),max(2*L-max(-R_i,0),0))
|
||||
UQADD8 r7, r7, r11
|
||||
UQSUB8 r9, r9, r11
|
||||
UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L)
|
||||
STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6]
|
||||
UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L)
|
||||
STRD r8, [r0] ; [p6:p2] = [r9: r8]
|
||||
LDMFD r13!,{r4-r11,PC}
|
||||
ENDP
|
||||
|
||||
oc_loop_filter_frag_rows_v6 PROC ; Walks fragments [_fragi0,_fragi0_end) and filters the edges of coded ones
|
||||
; r0 = _ref_frame_data
|
||||
; r1 = _ystride
|
||||
; r2 = _bv
|
||||
; r3 = _frags
|
||||
; r4 = _fragi0 (r4-r9 are stack arguments, loaded below)
|
||||
; r5 = _fragi0_end
|
||||
; r6 = _fragi_top
|
||||
; r7 = _fragi_bot
|
||||
; r8 = _frag_buf_offs
|
||||
; r9 = _nhfrags
|
||||
MOV r12,r13 ; r12= address of the first stack argument
|
||||
STMFD r13!,{r0,r4-r11,r14} ; r0 lands at [r13] and is reloaded per fragment
|
||||
LDMFD r12,{r4-r9} ; load the six stack arguments
|
||||
LDR r2, [r2] ; ll = *(int *)_bv
|
||||
CMP r4, r5 ; if(_fragi0>=_fragi0_end)
|
||||
BGE oslffri_v6_end ; bail
|
||||
SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
|
||||
BLE oslffri_v6_end ; bail
|
||||
ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi] (entries indexed as 4 bytes)
|
||||
ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
|
||||
SUB r7, r7, r9 ; _fragi_bot -= _nhfrags-1 (r9 was decremented above)
|
||||
oslffri_v6_lp1
|
||||
MOV r10,r4 ; r10= fragi = _fragi0
|
||||
ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
|
||||
oslffri_v6_lp2
|
||||
LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
|
||||
LDR r0, [r13] ; r0 = _ref_frame_data
|
||||
LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
|
||||
TST r14,#OC_FRAG_CODED_FLAG
|
||||
BEQ oslffri_v6_uncoded
|
||||
CMP r10,r4 ; if (fragi>_fragi0)
|
||||
ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
|
||||
BLGT loop_filter_h_v6
|
||||
CMP r4, r6 ; if (fragi0>_fragi_top)
|
||||
BLGT loop_filter_v_v6
|
||||
CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
|
||||
LDRLT r12,[r3] ; r12 = _frags[fragi+1]
|
||||
ADD r0, r0, #8 ; r0 = 8 pixels right of the fragment start
|
||||
ADD r10,r10,#1 ; r10 = fragi+1;
|
||||
ANDLT r12,r12,#OC_FRAG_CODED_FLAG
|
||||
CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
|
||||
BLLT loop_filter_h_v6
|
||||
CMP r10,r7 ; if (fragi<_fragi_bot)
|
||||
LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
|
||||
SUB r0, r0, #8
|
||||
ADD r0, r0, r1, LSL #3 ; r0 = fragment start + 8 rows
|
||||
ANDLT r12,r12,#OC_FRAG_CODED_FLAG
|
||||
CMPLT r12,#OC_FRAG_CODED_FLAG ; && the fragment one row down is not coded
|
||||
BLLT loop_filter_v_v6
|
||||
CMP r10,r11 ; while(fragi<=fragi_end-1)
|
||||
BLE oslffri_v6_lp2
|
||||
MOV r4, r10 ; r4 = fragi0 += nhfrags
|
||||
CMP r4, r5
|
||||
BLT oslffri_v6_lp1
|
||||
oslffri_v6_end
|
||||
LDMFD r13!,{r0,r4-r11,PC}
|
||||
oslffri_v6_uncoded
|
||||
ADD r10,r10,#1 ; skip the uncoded fragment (r3 and r8 already advanced)
|
||||
CMP r10,r11
|
||||
BLE oslffri_v6_lp2
|
||||
MOV r4, r10 ; r4 = fragi0 += nhfrags
|
||||
CMP r4, r5
|
||||
BLT oslffri_v6_lp1
|
||||
LDMFD r13!,{r0,r4-r11,PC}
|
||||
ENDP
|
||||
]
|
||||
|
||||
[ OC_ARM_ASM_NEON
|
||||
EXPORT oc_loop_filter_init_neon
|
||||
EXPORT oc_loop_filter_frag_rows_neon
|
||||
|
||||
oc_loop_filter_init_neon PROC ; Stores eight 16-bit copies of 2*L at _bv
|
||||
; r0 = _bv (written with a 128-bit alignment hint)
|
||||
; r1 = _flimit (=L from the spec)
|
||||
MOV r1, r1, LSL #1 ; r1 = 2*L
|
||||
VDUP.S16 Q15, r1 ; Q15= 2L in U16s
|
||||
VST1.64 {D30,D31}, [r0@128] ; write all 16 bytes of Q15 to _bv
|
||||
MOV PC,r14 ; return
|
||||
ENDP
|
||||
|
||||
loop_filter_h_neon PROC ; Filters the two pixels on either side of _pix in all 8 rows at once
|
||||
; r0 = unsigned char *_pix
|
||||
; r1 = int _ystride
|
||||
; r2 = int *_bv (not read in this routine; the limit comes from Q15)
|
||||
; preserves r0-r3 (r12 is clobbered)
|
||||
; We assume Q15= 2*L in U16s
|
||||
; My best guesses at cycle counts (and latency)--vvv
|
||||
SUB r12,r0, #2 ; r12= address of the first of the 4 pixels in row 0
|
||||
; Doing a 2-element structure load saves doing two VTRN's below, at the
|
||||
; cost of using two more slower single-lane loads vs. the faster
|
||||
; all-lane loads.
|
||||
; It's less code this way, though, and benches a hair faster, but it
|
||||
; leaves D2 and D4 swapped.
|
||||
VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1
|
||||
; D2 = ____________3322
|
||||
VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1
|
||||
; D6 = ____________7766
|
||||
VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1
|
||||
; D2 = ________BBAA3322
|
||||
VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1
|
||||
; D6 = ________FFEE7766
|
||||
VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____HHGG99881100 3,1
|
||||
; D2 = ____JJIIBBAA3322
|
||||
VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____LLKKDDCC5544 3,1
|
||||
; D6 = ____NNMMFFEE7766
|
||||
VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOHHGG99881100 3,1
|
||||
; D2 = RRQQJJIIBBAA3322
|
||||
VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSLLKKDDCC5544 3,1
|
||||
; D6 = VVUUNNMMFFEE7766
|
||||
VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1
|
||||
VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
|
||||
VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
|
||||
VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3
|
||||
ADD r12,r0, #8 ; r12= _pix+8, used below only as a prefetch address
|
||||
VADD.S16 Q0, Q0, Q8 ; 1,3
|
||||
PLD [r12]
|
||||
VADD.S16 Q0, Q0, Q8 ; 1,3
|
||||
PLD [r12,r1]
|
||||
VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
|
||||
PLD [r12,r1, LSL #1]
|
||||
VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
|
||||
ADD r12,r12,r1, LSL #2
|
||||
; We want to do
|
||||
; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
|
||||
; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
|
||||
; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
|
||||
; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
|
||||
; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
|
||||
; So we've reduced the left and right hand terms to be the same, except
|
||||
; for a negation.
|
||||
; Stall x3
|
||||
VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
|
||||
PLD [r12,-r1]
|
||||
VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
|
||||
PLD [r12]
|
||||
VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
|
||||
PLD [r12,r1]
|
||||
VMOVL.U8 Q1, D2 ; Q1 = __UU__QQ__MM__II__EE__AA__66__22 2,3
|
||||
PLD [r12,r1,LSL #1]
|
||||
VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|,0)) 1,4
|
||||
ADD r12,r12,r1, LSL #2
|
||||
; Now we need to correct for the sign of f.
|
||||
; For negative elements of Q0, we want to subtract the appropriate
|
||||
; element of Q9. For positive elements we want to add them. No NEON
|
||||
; instruction exists to do this, so we need to negate the negative
|
||||
; elements, and we can then just add them. -x = ~(x-1) = (x+m)^m for m=-1
|
||||
VADD.S16 Q9, Q9, Q0 ; 1,3
|
||||
PLD [r12,-r1]
|
||||
VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
|
||||
; Bah. No VRSBW.U8
|
||||
; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
|
||||
VADDW.U8 Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
|
||||
VSUB.S16 Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
|
||||
VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1
|
||||
VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1
|
||||
SUB r12,r0, #1 ; r12= address of the two filtered pixels in row 0
|
||||
VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655 1,1
|
||||
VST1.16 {D4[0]}, [r12], r1 ; each store writes one row's two filtered pixels
|
||||
VST1.16 {D2[0]}, [r12], r1
|
||||
VST1.16 {D4[1]}, [r12], r1
|
||||
VST1.16 {D2[1]}, [r12], r1
|
||||
VST1.16 {D4[2]}, [r12], r1
|
||||
VST1.16 {D2[2]}, [r12], r1
|
||||
VST1.16 {D4[3]}, [r12], r1
|
||||
VST1.16 {D2[3]}, [r12], r1
|
||||
MOV PC,r14 ; return
|
||||
ENDP
|
||||
|
||||
loop_filter_v_neon PROC
; Filters the boundary between the row at _pix-_ystride and the row at _pix,
;  8 pixels wide.
; It loads the 4 rows starting at _pix-2*_ystride, computes the clamped filter
;  value f for each column, and stores row1+f and row2-f (saturated to 8 bits)
;  back to _pix-_ystride and _pix.
; r12 is used as scratch and is not restored; r2 is not read by this routine
;  (the limit is taken from Q15, see below).
; NOTE(review): the @64 qualifiers on the loads/stores imply that every row
;  address is 64-bit aligned - confirm that all callers guarantee this.
|
||||
; r0 = unsigned char *_pix
|
||||
; r1 = int _ystride
|
||||
; r2 = int *_bv
|
||||
; preserves r0-r3
|
||||
; We assume Q15= 2*L in U16s
|
||||
; My best guesses at cycle counts (and latency)--vvv
|
||||
SUB r12,r0, r1, LSL #1
|
||||
VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1
|
||||
VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1
|
||||
VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1
|
||||
VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1
|
||||
VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3
|
||||
VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
|
||||
; After three post-incremented loads r12 = _pix+_ystride; step 8 bytes to the
;  right and prefetch two rows there, interleaved with the arithmetic.
ADD r12, #8
|
||||
VADD.S16 Q0, Q0, Q8 ; 1,3
|
||||
PLD [r12]
|
||||
VADD.S16 Q0, Q0, Q8 ; 1,3
|
||||
PLD [r12,r1]
|
||||
VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
|
||||
; r12 = address of row 1, the first of the two rows that get written back.
SUB r12, r0, r1
|
||||
VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
|
||||
; We want to do
|
||||
; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
|
||||
; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
|
||||
; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
|
||||
; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
|
||||
; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
|
||||
; So we've reduced the left and right hand terms to be the same, except
|
||||
; for a negation.
|
||||
; Stall x3
|
||||
VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
|
||||
VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
|
||||
; Stall x2
|
||||
VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
|
||||
VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
|
||||
; Stall x2
|
||||
VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
|
||||
; Now we need to correct for the sign of f.
|
||||
; For negative elements of Q0, we want to subtract the appropriate
|
||||
; element of Q9. For positive elements we want to add them. No NEON
|
||||
; instruction exists to do this, so we need to negate the negative
|
||||
; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
|
||||
; Stall x3
|
||||
; Q0 is 0 or -1 per lane, so (Q9+Q0)^Q0 leaves a lane unchanged when Q0 is 0
;  and yields its two's complement negation when Q0 is -1.
VADD.S16 Q9, Q9, Q0 ; 1,3
|
||||
; Stall x2
|
||||
VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
|
||||
; Bah. No VRSBW.U8
|
||||
; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
|
||||
VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
|
||||
VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
|
||||
VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1
|
||||
VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1
|
||||
VST1.64 {D2}, [r12@64], r1
|
||||
VST1.64 {D4}, [r12@64], r1
|
||||
MOV PC,r14
|
||||
ENDP
|
||||
|
||||
oc_loop_filter_frag_rows_neon PROC
; Walks the fragments from _fragi0 up to (but not including) _fragi0_end, one
;  row of _nhfrags fragments at a time, and filters the edges of every coded
;  fragment:
;   - its left edge, unless it is the first fragment of its row;
;   - its top edge, if _fragi0>_fragi_top;
;   - its right edge (8 pixels over), if a next fragment exists in the row and
;     is not coded;
;   - its bottom edge (8 rows down), if fragi+_nhfrags<_fragi_bot and the
;     fragment below is not coded.
; Only r0-r3 arrive in registers; the values listed for r4-r9 below are loaded
;  from the six words at the caller's stack pointer.
|
||||
; r0 = _ref_frame_data
|
||||
; r1 = _ystride
|
||||
; r2 = _bv
|
||||
; r3 = _frags
|
||||
; r4 = _fragi0
|
||||
; r5 = _fragi0_end
|
||||
; r6 = _fragi_top
|
||||
; r7 = _fragi_bot
|
||||
; r8 = _frag_buf_offs
|
||||
; r9 = _nhfrags
|
||||
; Remember where the stack arguments are before the push moves r13.
MOV r12,r13
|
||||
; r0 is stored at the lowest address, so the loop can reload _ref_frame_data
;  with LDR r0,[r13].
STMFD r13!,{r0,r4-r11,r14}
|
||||
LDMFD r12,{r4-r9}
|
||||
CMP r4, r5 ; if(_fragi0>=_fragi0_end)
|
||||
BGE oslffri_neon_end; bail
|
||||
; NOTE(review): BLE is taken when _nhfrags-1<=0, so a frame that is a single
;  fragment wide returns without being filtered - confirm this is intended.
SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
|
||||
BLE oslffri_neon_end ; bail
|
||||
VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s
|
||||
ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
|
||||
ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
|
||||
; Bias _fragi_bot by _nhfrags-1 so it can be compared against fragi+1 below.
SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
|
||||
oslffri_neon_lp1
|
||||
MOV r10,r4 ; r10= fragi = _fragi0
|
||||
ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
|
||||
oslffri_neon_lp2
|
||||
; r12 and r14 are only temporaries here: each BL below overwrites r14, and the
;  filter routines use r12 as scratch.
LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
|
||||
LDR r0, [r13] ; r0 = _ref_frame_data
|
||||
LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
|
||||
TST r14,#OC_FRAG_CODED_FLAG
|
||||
BEQ oslffri_neon_uncoded
|
||||
CMP r10,r4 ; if (fragi>_fragi0)
|
||||
; ADD leaves the flags alone, so BLGT still tests the CMP above.
ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
|
||||
BLGT loop_filter_h_neon
|
||||
CMP r4, r6 ; if (_fragi0>_fragi_top)
|
||||
BLGT loop_filter_v_neon
|
||||
; The LT result of this CMP survives the two ADDs; CMPLT then keeps LT only if
;  the next fragment's coded flag is clear.
CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
|
||||
LDRLT r12,[r3] ; r12 = _frags[fragi+1]
|
||||
ADD r0, r0, #8
|
||||
ADD r10,r10,#1 ; r10 = fragi+1;
|
||||
ANDLT r12,r12,#OC_FRAG_CODED_FLAG
|
||||
CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
|
||||
BLLT loop_filter_h_neon
|
||||
; r10 already holds fragi+1 and r7 holds _fragi_bot-(_nhfrags-1), so this tests
;  fragi+_nhfrags<_fragi_bot.
CMP r10,r7 ; if (fragi<_fragi_bot)
|
||||
LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
|
||||
; Undo the +8 column offset and move r0 eight rows down, to the bottom edge.
SUB r0, r0, #8
|
||||
ADD r0, r0, r1, LSL #3
|
||||
ANDLT r12,r12,#OC_FRAG_CODED_FLAG
|
||||
CMPLT r12,#OC_FRAG_CODED_FLAG
|
||||
BLLT loop_filter_v_neon
|
||||
CMP r10,r11 ; while(fragi<=fragi_end-1)
|
||||
BLE oslffri_neon_lp2
|
||||
MOV r4, r10 ; r4 = _fragi0 += _nhfrags
|
||||
CMP r4, r5
|
||||
BLT oslffri_neon_lp1
|
||||
oslffri_neon_end
|
||||
LDMFD r13!,{r0,r4-r11,PC}
|
||||
; Uncoded fragments skip all four filter calls and run a copy of the loop tail.
oslffri_neon_uncoded
|
||||
ADD r10,r10,#1
|
||||
CMP r10,r11
|
||||
BLE oslffri_neon_lp2
|
||||
MOV r4, r10 ; r4 = _fragi0 += _nhfrags
|
||||
CMP r4, r5
|
||||
BLT oslffri_neon_lp1
|
||||
LDMFD r13!,{r0,r4-r11,PC}
|
||||
ENDP
|
||||
]
|
||||
|
||||
END
|
39
media/libtheora/lib/arm/armopts.s
Normal file
39
media/libtheora/lib/arm/armopts.s
Normal file
@ -0,0 +1,39 @@
|
||||
;********************************************************************
|
||||
;* *
|
||||
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
;* *
|
||||
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
;* *
|
||||
;********************************************************************
|
||||
; Original implementation:
|
||||
; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
|
||||
; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
|
||||
;********************************************************************
|
||||
|
||||
; Set the following to 1 if we have EDSP instructions
|
||||
; (LDRD/STRD, etc., ARMv5E and later).
|
||||
OC_ARM_ASM_EDSP * 1
|
||||
|
||||
; Set the following to 1 if we have ARMv6 media instructions.
|
||||
OC_ARM_ASM_MEDIA * 1
|
||||
|
||||
; Set the following to 1 if we have NEON (some ARMv7)
|
||||
OC_ARM_ASM_NEON * 1
|
||||
|
||||
; Set the following to 1 if LDR/STR can work on unaligned addresses
|
||||
; This is assumed to be true for ARMv6 and later code
|
||||
OC_ARM_CAN_UNALIGN * 0
|
||||
|
||||
; Large unaligned loads and stores are often configured to cause an exception.
|
||||
; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
|
||||
; boundary, so it's usually a bad idea to use them anyway if they can be
|
||||
; avoided.
|
||||
|
||||
; Set the following to 1 if LDRD/STRD can work on unaligned addresses
|
||||
OC_ARM_CAN_UNALIGN_LDRD * 0
|
||||
|
||||
END
|
219
media/libtheora/lib/arm/armstate.c
Normal file
219
media/libtheora/lib/arm/armstate.c
Normal file
@ -0,0 +1,219 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#include "armint.h"
|
||||
|
||||
#if defined(OC_ARM_ASM)
|
||||
|
||||
# if defined(OC_ARM_ASM_NEON)
|
||||
/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
|
||||
the destination.
  The table has 128 entries: the first 64 are the transposed zig-zag order, and
   the remaining 64 all map to index 64.
  NOTE(review): index 64 is presumably the spare coefficient slot that absorbs
   out-of-range zig-zag positions; confirm against the code that indexes this
   table.*/
|
||||
static const unsigned char OC_FZIG_ZAG_NEON[128]={
|
||||
0, 8, 1, 2, 9,16,24,17,
|
||||
10, 3, 4,11,18,25,32,40,
|
||||
33,26,19,12, 5, 6,13,20,
|
||||
27,34,41,48,56,49,42,35,
|
||||
28,21,14, 7,15,22,29,36,
|
||||
43,50,57,58,51,44,37,30,
|
||||
23,31,38,45,52,59,60,53,
|
||||
46,39,47,54,61,62,55,63,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64
|
||||
};
|
||||
# endif
|
||||
|
||||
/*Configures _state to use the ARM-accelerated routines.
  The C defaults are installed first and the detected CPU flags are stored in
   _state->cpu_flags.
  The plain ARM entries are then written, followed by the EDSP, ARMv6 media and
   NEON entries for every feature that is both compiled in and reported by the
   CPU.
  Later groups overwrite entries written by earlier ones, so the order of the
   groups must be kept.*/
void oc_state_accel_init_arm(oc_theora_state *_state){
  oc_state_accel_init_c(_state);
  _state->cpu_flags=oc_cpu_flags_get();
# if defined(OC_STATE_USE_VTABLE)
  /*The loop filter entry cannot be left pointing at the C version: the macro
     in armint.h passes a different argument list, and the C routine would
     segfault if called that way.*/
  _state->opt_vtable.state_loop_filter_frag_rows=
   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
# endif
# if defined(OC_ARM_ASM_EDSP)
  /*EDSP only provides a faster fragment copy.*/
  if((_state->cpu_flags&OC_CPU_ARM_EDSP)!=0){
#  if defined(OC_STATE_USE_VTABLE)
    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
#  endif
  }
#  if defined(OC_ARM_ASM_MEDIA)
  /*ARMv6 media instructions: reconstruction, iDCT and loop filter.*/
  if((_state->cpu_flags&OC_CPU_ARM_MEDIA)!=0){
#   if defined(OC_STATE_USE_VTABLE)
    _state->opt_vtable.state_loop_filter_frag_rows=
     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
#   endif
  }
#   if defined(OC_ARM_ASM_NEON)
  /*NEON replaces every entry and also switches to the transposed zig-zag
     table, which is set even when no vtable is in use.*/
  if((_state->cpu_flags&OC_CPU_ARM_NEON)!=0){
    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
#    if defined(OC_STATE_USE_VTABLE)
    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
    _state->opt_vtable.state_loop_filter_frag_rows=
     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
#    endif
  }
#   endif
#  endif
# endif
}
|
||||
|
||||
/*Reconstructs one fragment with the plain ARM routines.
  The inverse transform output is written to the second half of _dct_coeffs
   and then handed to the intra or inter reconstruction routine together with
   the destination in the OC_FRAME_SELF buffer.
  _state:      The codec state holding the fragment tables and frame buffers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The plane index, used for the stride and the MV offsets.
  _dct_coeffs: The coefficients (entries 0-63) and the iDCT output area
                (entries 64-127).
  _last_zzi:   Values below 2 select the DC-only path; otherwise it is passed
                on to the full iDCT.
  _dc_quant:   The multiplier applied to the DC coefficient.*/
void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  ogg_int16_t   *residue;
  unsigned char *out;
  ptrdiff_t      buf_off;
  int            stride;
  int            ref_idx;
  residue=_dct_coeffs+64;
  if(_last_zzi>=2){
    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_arm(residue,_dct_coeffs,_last_zzi);
  }
  else{
    ogg_uint16_t dc;
    /*DC-only case: this is the one dequantized product that gets rounded,
       because no iDCT rounding follows it.*/
    dc=(ogg_uint16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
    oc_idct8x8_1_arm(residue,dc);
  }
  /*Locate the destination and pick the reconstruction routine.*/
  buf_off=_state->frag_buf_offs[_fragi];
  ref_idx=_state->frags[_fragi].refi;
  stride=_state->ref_ystride[_pli];
  out=_state->ref_frame_data[OC_FRAME_SELF]+buf_off;
  if(ref_idx!=OC_FRAME_SELF){
    const unsigned char *pred;
    int                  mv_off[2];
    pred=_state->ref_frame_data[ref_idx]+buf_off;
    /*A result above 1 means both offsets are needed.*/
    if(oc_state_get_mv_offsets(_state,mv_off,_pli,
     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_arm(out,pred+mv_off[0],pred+mv_off[1],stride,
       residue);
    }
    else oc_frag_recon_inter_arm(out,pred+mv_off[0],stride,residue);
  }
  else oc_frag_recon_intra_arm(out,stride,residue);
}
|
||||
|
||||
# if defined(OC_ARM_ASM_MEDIA)
|
||||
/*Reconstructs one fragment with the ARMv6 media routines.
  The inverse transform output is written to the second half of _dct_coeffs
   and then handed to the intra or inter reconstruction routine together with
   the destination in the OC_FRAME_SELF buffer.
  _state:      The codec state holding the fragment tables and frame buffers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The plane index, used for the stride and the MV offsets.
  _dct_coeffs: The coefficients (entries 0-63) and the iDCT output area
                (entries 64-127).
  _last_zzi:   Values below 2 select the DC-only path; otherwise it is passed
                on to the full iDCT.
  _dc_quant:   The multiplier applied to the DC coefficient.*/
void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  ogg_int16_t   *residue;
  unsigned char *out;
  ptrdiff_t      buf_off;
  int            stride;
  int            ref_idx;
  residue=_dct_coeffs+64;
  if(_last_zzi>=2){
    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_v6(residue,_dct_coeffs,_last_zzi);
  }
  else{
    ogg_uint16_t dc;
    /*DC-only case: this is the one dequantized product that gets rounded,
       because no iDCT rounding follows it.*/
    dc=(ogg_uint16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
    oc_idct8x8_1_v6(residue,dc);
  }
  /*Locate the destination and pick the reconstruction routine.*/
  buf_off=_state->frag_buf_offs[_fragi];
  ref_idx=_state->frags[_fragi].refi;
  stride=_state->ref_ystride[_pli];
  out=_state->ref_frame_data[OC_FRAME_SELF]+buf_off;
  if(ref_idx!=OC_FRAME_SELF){
    const unsigned char *pred;
    int                  mv_off[2];
    pred=_state->ref_frame_data[ref_idx]+buf_off;
    /*A result above 1 means both offsets are needed.*/
    if(oc_state_get_mv_offsets(_state,mv_off,_pli,
     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_v6(out,pred+mv_off[0],pred+mv_off[1],stride,
       residue);
    }
    else oc_frag_recon_inter_v6(out,pred+mv_off[0],stride,residue);
  }
  else oc_frag_recon_intra_v6(out,stride,residue);
}
|
||||
|
||||
# if defined(OC_ARM_ASM_NEON)
|
||||
/*Reconstructs one fragment with the NEON routines.
  The inverse transform output is written to the second half of _dct_coeffs
   and then handed to the intra or inter reconstruction routine together with
   the destination in the OC_FRAME_SELF buffer.
  _state:      The codec state holding the fragment tables and frame buffers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The plane index, used for the stride and the MV offsets.
  _dct_coeffs: The coefficients (entries 0-63) and the iDCT output area
                (entries 64-127).
  _last_zzi:   Values below 2 select the DC-only path; otherwise it is passed
                on to the full iDCT.
  _dc_quant:   The multiplier applied to the DC coefficient.*/
void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  ogg_int16_t   *residue;
  unsigned char *out;
  ptrdiff_t      buf_off;
  int            stride;
  int            ref_idx;
  residue=_dct_coeffs+64;
  if(_last_zzi>=2){
    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_neon(residue,_dct_coeffs,_last_zzi);
  }
  else{
    ogg_uint16_t dc;
    /*DC-only case: this is the one dequantized product that gets rounded,
       because no iDCT rounding follows it.*/
    dc=(ogg_uint16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
    oc_idct8x8_1_neon(residue,dc);
  }
  /*Locate the destination and pick the reconstruction routine.*/
  buf_off=_state->frag_buf_offs[_fragi];
  ref_idx=_state->frags[_fragi].refi;
  stride=_state->ref_ystride[_pli];
  out=_state->ref_frame_data[OC_FRAME_SELF]+buf_off;
  if(ref_idx!=OC_FRAME_SELF){
    const unsigned char *pred;
    int                  mv_off[2];
    pred=_state->ref_frame_data[ref_idx]+buf_off;
    /*A result above 1 means both offsets are needed.*/
    if(oc_state_get_mv_offsets(_state,mv_off,_pli,
     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_neon(out,pred+mv_off[0],pred+mv_off[1],stride,
       residue);
    }
    else oc_frag_recon_inter_neon(out,pred+mv_off[0],stride,residue);
  }
  else oc_frag_recon_intra_neon(out,stride,residue);
}
|
||||
# endif
|
||||
# endif
|
||||
|
||||
#endif
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function: packing variable sized words into an octet stream
|
||||
last mod: $Id: bitpack.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: bitpack.c 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#include <string.h>
|
||||
@ -32,15 +32,18 @@ static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
|
||||
const unsigned char *stop;
|
||||
oc_pb_window window;
|
||||
int available;
|
||||
unsigned shift;
|
||||
stop=_b->stop;
|
||||
ptr=_b->ptr;
|
||||
window=_b->window;
|
||||
available=_b->bits;
|
||||
ptr=_b->ptr;
|
||||
stop=_b->stop;
|
||||
while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
|
||||
available+=8;
|
||||
window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
|
||||
shift=OC_PB_WINDOW_SIZE-available;
|
||||
while(7<shift&&ptr<stop){
|
||||
shift-=8;
|
||||
window|=(oc_pb_window)*ptr++<<shift;
|
||||
}
|
||||
_b->ptr=ptr;
|
||||
available=OC_PB_WINDOW_SIZE-shift;
|
||||
if(_bits>available){
|
||||
if(ptr>=stop){
|
||||
_b->eof=1;
|
||||
@ -67,7 +70,7 @@ void oc_pack_adv1(oc_pack_buf *_b){
|
||||
}
|
||||
|
||||
/*Here we assume that 0<=_bits&&_bits<=32.*/
|
||||
long oc_pack_read(oc_pack_buf *_b,int _bits){
|
||||
long oc_pack_read_c(oc_pack_buf *_b,int _bits){
|
||||
oc_pb_window window;
|
||||
int available;
|
||||
long result;
|
||||
@ -82,12 +85,12 @@ long oc_pack_read(oc_pack_buf *_b,int _bits){
|
||||
available-=_bits;
|
||||
window<<=1;
|
||||
window<<=_bits-1;
|
||||
_b->bits=available;
|
||||
_b->window=window;
|
||||
_b->bits=available;
|
||||
return result;
|
||||
}
|
||||
|
||||
int oc_pack_read1(oc_pack_buf *_b){
|
||||
int oc_pack_read1_c(oc_pack_buf *_b){
|
||||
oc_pb_window window;
|
||||
int available;
|
||||
int result;
|
||||
@ -100,8 +103,8 @@ int oc_pack_read1(oc_pack_buf *_b){
|
||||
result=window>>OC_PB_WINDOW_SIZE-1;
|
||||
available--;
|
||||
window<<=1;
|
||||
_b->bits=available;
|
||||
_b->window=window;
|
||||
_b->bits=available;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -16,15 +16,32 @@
|
||||
********************************************************************/
|
||||
#if !defined(_bitpack_H)
|
||||
# define _bitpack_H (1)
|
||||
# include <stddef.h>
|
||||
# include <limits.h>
|
||||
# include "internal.h"
|
||||
|
||||
|
||||
|
||||
typedef unsigned long oc_pb_window;
|
||||
typedef size_t oc_pb_window;
|
||||
typedef struct oc_pack_buf oc_pack_buf;
|
||||
|
||||
|
||||
|
||||
/*Custom bitpacker implementations.*/
|
||||
# if defined(OC_ARM_ASM)
|
||||
# include "arm/armbits.h"
|
||||
# endif
|
||||
|
||||
# if !defined(oc_pack_read)
|
||||
# define oc_pack_read oc_pack_read_c
|
||||
# endif
|
||||
# if !defined(oc_pack_read1)
|
||||
# define oc_pack_read1 oc_pack_read1_c
|
||||
# endif
|
||||
# if !defined(oc_huff_token_decode)
|
||||
# define oc_huff_token_decode oc_huff_token_decode_c
|
||||
# endif
|
||||
|
||||
# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
|
||||
/*This is meant to be a large, positive constant that can still be efficiently
|
||||
loaded as an immediate (on platforms like ARM, for example).
|
||||
@ -34,9 +51,9 @@ typedef struct oc_pack_buf oc_pack_buf;
|
||||
|
||||
|
||||
struct oc_pack_buf{
|
||||
oc_pb_window window;
|
||||
const unsigned char *ptr;
|
||||
const unsigned char *stop;
|
||||
const unsigned char *ptr;
|
||||
oc_pb_window window;
|
||||
int bits;
|
||||
int eof;
|
||||
};
|
||||
@ -45,8 +62,8 @@ void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
|
||||
int oc_pack_look1(oc_pack_buf *_b);
|
||||
void oc_pack_adv1(oc_pack_buf *_b);
|
||||
/*Here we assume 0<=_bits&&_bits<=32.*/
|
||||
long oc_pack_read(oc_pack_buf *_b,int _bits);
|
||||
int oc_pack_read1(oc_pack_buf *_b);
|
||||
long oc_pack_read_c(oc_pack_buf *_b,int _bits);
|
||||
int oc_pack_read1_c(oc_pack_buf *_b);
|
||||
/* returns -1 for read beyond EOF, or the number of whole bytes available */
|
||||
long oc_pack_bytes_left(oc_pack_buf *_b);
|
||||
|
||||
|
@ -32,7 +32,7 @@
|
||||
#define HAVE_STRING_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/soundcard.h> header file. */
|
||||
/* #undef HAVE_SYS_SOUNDCARD_H */
|
||||
#define HAVE_SYS_SOUNDCARD_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/stat.h> header file. */
|
||||
#define HAVE_SYS_STAT_H 1
|
||||
@ -43,18 +43,29 @@
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#define HAVE_UNISTD_H 1
|
||||
|
||||
/* Define to the sub-directory in which libtool stores uninstalled libraries.
|
||||
*/
|
||||
#define LT_OBJDIR ".libs/"
|
||||
|
||||
/* Define to 1 if your C compiler doesn't accept -c and -o together. */
|
||||
/* #undef NO_MINUS_C_MINUS_O */
|
||||
|
||||
/* make use of arm asm optimization */
|
||||
|
||||
|
||||
/* Define if assembler supports EDSP instructions */
|
||||
|
||||
|
||||
/* Define if assembler supports ARMv6 media instructions */
|
||||
|
||||
|
||||
/* Define if compiler supports NEON instructions */
|
||||
|
||||
|
||||
/* make use of c64x+ asm optimization */
|
||||
/* #undef OC_C64X_ASM */
|
||||
|
||||
/* make use of x86_64 asm optimization */
|
||||
/* #undef OC_X86_64_ASM */
|
||||
|
||||
/* make use of x86 asm optimization */
|
||||
/**/
|
||||
/* #undef OC_X86_ASM */
|
||||
|
||||
/* Name of package */
|
||||
#define PACKAGE "libtheora"
|
||||
@ -66,16 +77,13 @@
|
||||
#define PACKAGE_NAME "libtheora"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "libtheora 1.1.1+svn"
|
||||
#define PACKAGE_STRING "libtheora 1.2.0alpha1+svn"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "libtheora"
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "1.1.1+svn"
|
||||
#define PACKAGE_VERSION "1.2.0alpha1+svn"
|
||||
|
||||
/* Define to 1 if you have the ANSI C header files. */
|
||||
#define STDC_HEADERS 1
|
||||
@ -87,4 +95,4 @@
|
||||
/* #undef THEORA_DISABLE_FLOAT */
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "1.1.1+svn"
|
||||
#define VERSION "1.2.0alpha1+svn"
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: decinfo.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: decinfo.c 17276 2010-06-05 05:57:05Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -128,6 +128,10 @@ static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
|
||||
_tc->comments*sizeof(_tc->comment_lengths[0]));
|
||||
_tc->user_comments=(char **)_ogg_malloc(
|
||||
_tc->comments*sizeof(_tc->user_comments[0]));
|
||||
if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
|
||||
_tc->comments=0;
|
||||
return TH_EFAULT;
|
||||
}
|
||||
for(i=0;i<_tc->comments;i++){
|
||||
len=oc_unpack_length(_opb);
|
||||
if(len<0||len>oc_pack_bytes_left(_opb)){
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: decint.h 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: decint.h 17457 2010-09-24 02:05:49Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -19,15 +19,39 @@
|
||||
#if !defined(_decint_H)
|
||||
# define _decint_H (1)
|
||||
# include "theora/theoradec.h"
|
||||
# include "internal.h"
|
||||
# include "state.h"
|
||||
# include "bitpack.h"
|
||||
|
||||
typedef struct th_setup_info oc_setup_info;
|
||||
typedef struct th_dec_ctx oc_dec_ctx;
|
||||
|
||||
# include "huffdec.h"
|
||||
# include "dequant.h"
|
||||
|
||||
typedef struct th_setup_info oc_setup_info;
|
||||
typedef struct oc_dec_opt_vtable oc_dec_opt_vtable;
|
||||
typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
|
||||
typedef struct th_dec_ctx oc_dec_ctx;
|
||||
|
||||
|
||||
|
||||
/*Decoder-specific accelerated functions.*/
|
||||
# if defined(OC_C64X_ASM)
|
||||
# include "c64x/c64xdec.h"
|
||||
# endif
|
||||
|
||||
# if !defined(oc_dec_accel_init)
|
||||
# define oc_dec_accel_init oc_dec_accel_init_c
|
||||
# endif
|
||||
# if defined(OC_DEC_USE_VTABLE)
|
||||
# if !defined(oc_dec_dc_unpredict_mcu_plane)
|
||||
# define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
|
||||
((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
|
||||
# endif
|
||||
# else
|
||||
# if !defined(oc_dec_dc_unpredict_mcu_plane)
|
||||
# define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
|
||||
/*Constants for the packet-in state machine specific to the decoder.*/
|
||||
|
||||
/*Next packet to read: Data packet.*/
|
||||
@ -37,71 +61,126 @@ typedef struct th_dec_ctx oc_dec_ctx;
|
||||
|
||||
struct th_setup_info{
|
||||
/*The Huffman codes.*/
|
||||
oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
|
||||
ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
|
||||
/*The quantization parameters.*/
|
||||
th_quant_info qinfo;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Decoder specific functions with accelerated variants.*/
|
||||
struct oc_dec_opt_vtable{
|
||||
void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
|
||||
oc_dec_pipeline_state *_pipe,int _pli);
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct oc_dec_pipeline_state{
|
||||
/*Decoded DCT coefficients.
|
||||
These are placed here instead of on the stack so that they can persist
|
||||
between blocks, which makes clearing them back to zero much faster when
|
||||
only a few non-zero coefficients were decoded.
|
||||
It requires at least 65 elements because the zig-zag index array uses the
|
||||
65th element as a dumping ground for out-of-range indices to protect us
|
||||
from buffer overflow.
|
||||
We make it fully twice as large so that the second half can serve as the
|
||||
reconstruction buffer, which saves passing another parameter to all the
|
||||
acceleration functions.
|
||||
It also solves problems with 16-byte alignment for NEON on ARM.
|
||||
gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
|
||||
alignment, and silently produces incorrect results if you ask for 16.
|
||||
Finally, keeping it off the stack means there's less likely to be a data
|
||||
hazard between the NEON co-processor and the regular ARM core, which avoids
|
||||
unnecessary stalls.*/
|
||||
OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
|
||||
OC_ALIGN16(signed char bounding_values[256]);
|
||||
ptrdiff_t ti[3][64];
|
||||
ptrdiff_t ebi[3][64];
|
||||
ptrdiff_t eob_runs[3][64];
|
||||
const ptrdiff_t *coded_fragis[3];
|
||||
const ptrdiff_t *uncoded_fragis[3];
|
||||
ptrdiff_t ncoded_fragis[3];
|
||||
ptrdiff_t nuncoded_fragis[3];
|
||||
const ogg_uint16_t *dequant[3][3][2];
|
||||
int fragy0[3];
|
||||
int fragy_end[3];
|
||||
int pred_last[3][4];
|
||||
int mcu_nvfrags;
|
||||
int loop_filter;
|
||||
int pp_level;
|
||||
};
|
||||
|
||||
|
||||
struct th_dec_ctx{
|
||||
/*Shared encoder/decoder state.*/
|
||||
oc_theora_state state;
|
||||
oc_theora_state state;
|
||||
/*Whether or not packets are ready to be emitted.
|
||||
This takes on negative values while there are remaining header packets to
|
||||
be emitted, reaches 0 when the codec is ready for input, and goes to 1
|
||||
when a frame has been processed and a data packet is ready.*/
|
||||
int packet_state;
|
||||
int packet_state;
|
||||
/*Buffer in which to assemble packets.*/
|
||||
oc_pack_buf opb;
|
||||
oc_pack_buf opb;
|
||||
/*Huffman decode trees.*/
|
||||
oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
|
||||
ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
|
||||
/*The index of the first token in each plane for each coefficient.*/
|
||||
ptrdiff_t ti0[3][64];
|
||||
ptrdiff_t ti0[3][64];
|
||||
/*The number of outstanding EOB runs at the start of each coefficient in each
|
||||
plane.*/
|
||||
ptrdiff_t eob_runs[3][64];
|
||||
ptrdiff_t eob_runs[3][64];
|
||||
/*The DCT token lists.*/
|
||||
unsigned char *dct_tokens;
|
||||
unsigned char *dct_tokens;
|
||||
/*The extra bits associated with DCT tokens.*/
|
||||
unsigned char *extra_bits;
|
||||
unsigned char *extra_bits;
|
||||
/*The number of dct tokens unpacked so far.*/
|
||||
int dct_tokens_count;
|
||||
int dct_tokens_count;
|
||||
/*The out-of-loop post-processing level.*/
|
||||
int pp_level;
|
||||
int pp_level;
|
||||
/*The DC scale used for out-of-loop deblocking.*/
|
||||
int pp_dc_scale[64];
|
||||
int pp_dc_scale[64];
|
||||
/*The sharpen modifier used for out-of-loop deringing.*/
|
||||
int pp_sharp_mod[64];
|
||||
int pp_sharp_mod[64];
|
||||
/*The DC quantization index of each block.*/
|
||||
unsigned char *dc_qis;
|
||||
unsigned char *dc_qis;
|
||||
/*The variance of each block.*/
|
||||
int *variances;
|
||||
int *variances;
|
||||
/*The storage for the post-processed frame buffer.*/
|
||||
unsigned char *pp_frame_data;
|
||||
unsigned char *pp_frame_data;
|
||||
/*Whether or not the post-processed frame buffer has space for chroma.*/
|
||||
int pp_frame_state;
|
||||
int pp_frame_state;
|
||||
/*The buffer used for the post-processed frame.
|
||||
Note that this is _not_ guaranteed to have the same strides and offsets as
|
||||
the reference frame buffers.*/
|
||||
th_ycbcr_buffer pp_frame_buf;
|
||||
th_ycbcr_buffer pp_frame_buf;
|
||||
/*The striped decode callback function.*/
|
||||
th_stripe_callback stripe_cb;
|
||||
th_stripe_callback stripe_cb;
|
||||
oc_dec_pipeline_state pipe;
|
||||
# if defined(OC_DEC_USE_VTABLE)
|
||||
/*Table for decoder acceleration functions.*/
|
||||
oc_dec_opt_vtable opt_vtable;
|
||||
# endif
|
||||
# if defined(HAVE_CAIRO)
|
||||
/*Output metrics for debugging.*/
|
||||
int telemetry;
|
||||
int telemetry_mbmode;
|
||||
int telemetry_mv;
|
||||
int telemetry_qi;
|
||||
int telemetry_bits;
|
||||
int telemetry_frame_bytes;
|
||||
int telemetry_coding_bytes;
|
||||
int telemetry_mode_bytes;
|
||||
int telemetry_mv_bytes;
|
||||
int telemetry_qi_bytes;
|
||||
int telemetry_dc_bytes;
|
||||
unsigned char *telemetry_frame_data;
|
||||
int telemetry;
|
||||
int telemetry_mbmode;
|
||||
int telemetry_mv;
|
||||
int telemetry_qi;
|
||||
int telemetry_bits;
|
||||
int telemetry_frame_bytes;
|
||||
int telemetry_coding_bytes;
|
||||
int telemetry_mode_bytes;
|
||||
int telemetry_mv_bytes;
|
||||
int telemetry_qi_bytes;
|
||||
int telemetry_dc_bytes;
|
||||
unsigned char *telemetry_frame_data;
|
||||
# endif
|
||||
};
|
||||
|
||||
/*Default pure-C implementations of decoder-specific accelerated functions.*/
|
||||
void oc_dec_accel_init_c(oc_dec_ctx *_dec);
|
||||
|
||||
void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
|
||||
oc_dec_pipeline_state *_pipe,int _pli);
|
||||
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,493 +0,0 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: encint.h 16503 2009-08-22 18:14:02Z giles $
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_encint_H)
|
||||
# define _encint_H (1)
|
||||
# if defined(HAVE_CONFIG_H)
|
||||
# include "config.h"
|
||||
# endif
|
||||
# include "theora/theoraenc.h"
|
||||
# include "internal.h"
|
||||
# include "ocintrin.h"
|
||||
# include "mathops.h"
|
||||
# include "enquant.h"
|
||||
# include "huffenc.h"
|
||||
/*# define OC_COLLECT_METRICS*/
|
||||
|
||||
|
||||
|
||||
typedef oc_mv oc_mv2[2];
|
||||
|
||||
typedef struct oc_enc_opt_vtable oc_enc_opt_vtable;
|
||||
typedef struct oc_mb_enc_info oc_mb_enc_info;
|
||||
typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
|
||||
typedef struct oc_iir_filter oc_iir_filter;
|
||||
typedef struct oc_frame_metrics oc_frame_metrics;
|
||||
typedef struct oc_rc_state oc_rc_state;
|
||||
typedef struct th_enc_ctx oc_enc_ctx;
|
||||
typedef struct oc_token_checkpoint oc_token_checkpoint;
|
||||
|
||||
|
||||
|
||||
/*Constants for the packet-out state machine specific to the encoder.*/
|
||||
|
||||
/*Next packet to emit: Data packet, but none are ready yet.*/
|
||||
#define OC_PACKET_EMPTY (0)
|
||||
/*Next packet to emit: Data packet, and one is ready.*/
|
||||
#define OC_PACKET_READY (1)
|
||||
|
||||
/*All features enabled.*/
|
||||
#define OC_SP_LEVEL_SLOW (0)
|
||||
/*Enable early skip.*/
|
||||
#define OC_SP_LEVEL_EARLY_SKIP (1)
|
||||
/*Disable motion compensation.*/
|
||||
#define OC_SP_LEVEL_NOMC (2)
|
||||
/*Maximum valid speed level.*/
|
||||
#define OC_SP_LEVEL_MAX (2)
|
||||
|
||||
|
||||
/*The bits used for each of the MB mode codebooks.*/
|
||||
extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
|
||||
|
||||
/*The bits used for each of the MV codebooks.*/
|
||||
extern const unsigned char OC_MV_BITS[2][64];
|
||||
|
||||
/*The minimum value that can be stored in a SB run for each codeword.
|
||||
The last entry is the upper bound on the length of a single SB run.*/
|
||||
extern const ogg_uint16_t OC_SB_RUN_VAL_MIN[8];
|
||||
/*The bits used for each SB run codeword.*/
|
||||
extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
|
||||
|
||||
/*The bits used for each block run length (starting with 1).*/
|
||||
extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
|
||||
|
||||
|
||||
|
||||
/*Encoder specific functions with accelerated variants.*/
|
||||
struct oc_enc_opt_vtable{
|
||||
unsigned (*frag_sad)(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned (*frag_sad_thresh)(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned (*frag_sad2_thresh)(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned (*frag_satd_thresh)(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned (*frag_satd2_thresh)(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
|
||||
void (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
void (*frag_sub_128)(ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,int _ystride);
|
||||
void (*frag_copy2)(unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
};
|
||||
|
||||
|
||||
void oc_enc_vtable_init(oc_enc_ctx *_enc);
|
||||
|
||||
|
||||
|
||||
/*Encoder-specific macroblock information.*/
|
||||
struct oc_mb_enc_info{
|
||||
/*Neighboring macro blocks that have MVs available from the current frame.*/
|
||||
unsigned cneighbors[4];
|
||||
/*Neighboring macro blocks to use for MVs from the previous frame.*/
|
||||
unsigned pneighbors[4];
|
||||
/*The number of current-frame neighbors.*/
|
||||
unsigned char ncneighbors;
|
||||
/*The number of previous-frame neighbors.*/
|
||||
unsigned char npneighbors;
|
||||
/*Flags indicating which MB modes have been refined.*/
|
||||
unsigned char refined;
|
||||
/*Motion vectors for a macro block for the current frame and the
|
||||
previous two frames.
|
||||
Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
|
||||
can be used to estimate constant velocity and constant acceleration
|
||||
predictors.
|
||||
Uninitialized MVs are (0,0).*/
|
||||
oc_mv2 analysis_mv[3];
|
||||
/*Current unrefined analysis MVs.*/
|
||||
oc_mv unref_mv[2];
|
||||
/*Unrefined block MVs.*/
|
||||
oc_mv block_mv[4];
|
||||
/*Refined block MVs.*/
|
||||
oc_mv ref_mv[4];
|
||||
/*Minimum motion estimation error from the analysis stage.*/
|
||||
ogg_uint16_t error[2];
|
||||
/*MB error for half-pel refinement for each frame type.*/
|
||||
unsigned satd[2];
|
||||
/*Block error for half-pel refinement.*/
|
||||
unsigned block_satd[4];
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*State machine to estimate the opportunity cost of coding a MB mode.*/
|
||||
struct oc_mode_scheme_chooser{
|
||||
/*Pointers to a list containing the index of each mode in the mode
|
||||
alphabet used by each scheme.
|
||||
The first entry points to the dynamic scheme0_ranks, while the remaining 7
|
||||
point to the constant entries stored in OC_MODE_SCHEMES.*/
|
||||
const unsigned char *mode_ranks[8];
|
||||
/*The ranks for each mode when coded with scheme 0.
|
||||
These are optimized so that the more frequent modes have lower ranks.*/
|
||||
unsigned char scheme0_ranks[OC_NMODES];
|
||||
/*The list of modes, sorted in descending order of frequency, that
|
||||
corresponds to the ranks above.*/
|
||||
unsigned char scheme0_list[OC_NMODES];
|
||||
/*The number of times each mode has been chosen so far.*/
|
||||
int mode_counts[OC_NMODES];
|
||||
/*The list of mode coding schemes, sorted in ascending order of bit cost.*/
|
||||
unsigned char scheme_list[8];
|
||||
/*The number of bits used by each mode coding scheme.*/
|
||||
ptrdiff_t scheme_bits[8];
|
||||
};
|
||||
|
||||
|
||||
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
|
||||
|
||||
|
||||
|
||||
/*A 2nd order low-pass Bessel follower.
|
||||
We use this for rate control because it has fast reaction time, but is
|
||||
critically damped.*/
|
||||
struct oc_iir_filter{
|
||||
ogg_int32_t c[2];
|
||||
ogg_int64_t g;
|
||||
ogg_int32_t x[2];
|
||||
ogg_int32_t y[2];
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*The 2-pass metrics associated with a single frame.*/
|
||||
struct oc_frame_metrics{
|
||||
/*The log base 2 of the scale factor for this frame in Q24 format.*/
|
||||
ogg_int32_t log_scale;
|
||||
/*The number of application-requested duplicates of this frame.*/
|
||||
unsigned dup_count:31;
|
||||
/*The frame type from pass 1.*/
|
||||
unsigned frame_type:1;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Rate control state information.*/
|
||||
struct oc_rc_state{
|
||||
/*The target average bits per frame.*/
|
||||
ogg_int64_t bits_per_frame;
|
||||
/*The current buffer fullness (bits available to be used).*/
|
||||
ogg_int64_t fullness;
|
||||
/*The target buffer fullness.
|
||||
This is where we'd like to be by the last keyframe that appears in the next
|
||||
buf_delay frames.*/
|
||||
ogg_int64_t target;
|
||||
/*The maximum buffer fullness (total size of the buffer).*/
|
||||
ogg_int64_t max;
|
||||
/*The log of the number of pixels in a frame in Q57 format.*/
|
||||
ogg_int64_t log_npixels;
|
||||
/*The exponent used in the rate model in Q8 format.*/
|
||||
unsigned exp[2];
|
||||
/*The number of frames to distribute the buffer usage over.*/
|
||||
int buf_delay;
|
||||
/*The total drop count from the previous frame.
|
||||
This includes duplicates explicitly requested via the
|
||||
TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
|
||||
ogg_uint32_t prev_drop_count;
|
||||
/*The log of an estimated scale factor used to obtain the real framerate, for
|
||||
VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
|
||||
ogg_int64_t log_drop_scale;
|
||||
/*The log of estimated scale factor for the rate model in Q57 format.*/
|
||||
ogg_int64_t log_scale[2];
|
||||
/*The log of the target quantizer level in Q57 format.*/
|
||||
ogg_int64_t log_qtarget;
|
||||
/*Will we drop frames to meet bitrate target?*/
|
||||
unsigned char drop_frames;
|
||||
/*Do we respect the maximum buffer fullness?*/
|
||||
unsigned char cap_overflow;
|
||||
/*Can the reservoir go negative?*/
|
||||
unsigned char cap_underflow;
|
||||
/*Second-order lowpass filters to track scale and VFR.*/
|
||||
oc_iir_filter scalefilter[2];
|
||||
int inter_count;
|
||||
int inter_delay;
|
||||
int inter_delay_target;
|
||||
oc_iir_filter vfrfilter;
|
||||
/*Two-pass mode state.
|
||||
0 => 1-pass encoding.
|
||||
1 => 1st pass of 2-pass encoding.
|
||||
2 => 2nd pass of 2-pass encoding.*/
|
||||
int twopass;
|
||||
/*Buffer for current frame metrics.*/
|
||||
unsigned char twopass_buffer[48];
|
||||
/*The number of bytes in the frame metrics buffer.
|
||||
When 2-pass encoding is enabled, this is set to 0 after each frame is
|
||||
submitted, and must be non-zero before the next frame will be accepted.*/
|
||||
int twopass_buffer_bytes;
|
||||
int twopass_buffer_fill;
|
||||
/*Whether or not to force the next frame to be a keyframe.*/
|
||||
unsigned char twopass_force_kf;
|
||||
/*The metrics for the previous frame.*/
|
||||
oc_frame_metrics prev_metrics;
|
||||
/*The metrics for the current frame.*/
|
||||
oc_frame_metrics cur_metrics;
|
||||
/*The buffered metrics for future frames.*/
|
||||
oc_frame_metrics *frame_metrics;
|
||||
int nframe_metrics;
|
||||
int cframe_metrics;
|
||||
/*The index of the current frame in the circular metric buffer.*/
|
||||
int frame_metrics_head;
|
||||
/*The frame count of each type (keyframes, delta frames, and dup frames);
|
||||
32 bits limits us to 2.268 years at 60 fps.*/
|
||||
ogg_uint32_t frames_total[3];
|
||||
/*The number of frames of each type yet to be processed.*/
|
||||
ogg_uint32_t frames_left[3];
|
||||
/*The sum of the scale values for each frame type.*/
|
||||
ogg_int64_t scale_sum[2];
|
||||
/*The start of the window over which the current scale sums are taken.*/
|
||||
int scale_window0;
|
||||
/*The end of the window over which the current scale sums are taken.*/
|
||||
int scale_window_end;
|
||||
/*The frame count of each type in the current 2-pass window; this does not
|
||||
include dup frames.*/
|
||||
int nframes[3];
|
||||
/*The total accumulated estimation bias.*/
|
||||
ogg_int64_t rate_bias;
|
||||
};
|
||||
|
||||
|
||||
void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
|
||||
void oc_rc_state_clear(oc_rc_state *_rc);
|
||||
|
||||
void oc_enc_rc_resize(oc_enc_ctx *_enc);
|
||||
int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
|
||||
void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
|
||||
int oc_enc_update_rc_state(oc_enc_ctx *_enc,
|
||||
long _bits,int _qti,int _qi,int _trial,int _droppable);
|
||||
int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
|
||||
int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
|
||||
|
||||
|
||||
|
||||
/*The internal encoder state.*/
|
||||
struct th_enc_ctx{
|
||||
/*Shared encoder/decoder state.*/
|
||||
oc_theora_state state;
|
||||
/*Buffer in which to assemble packets.*/
|
||||
oggpack_buffer opb;
|
||||
/*Encoder-specific macroblock information.*/
|
||||
oc_mb_enc_info *mb_info;
|
||||
/*DC coefficients after prediction.*/
|
||||
ogg_int16_t *frag_dc;
|
||||
/*The list of coded macro blocks, in coded order.*/
|
||||
unsigned *coded_mbis;
|
||||
/*The number of coded macro blocks.*/
|
||||
size_t ncoded_mbis;
|
||||
/*Whether or not packets are ready to be emitted.
|
||||
This takes on negative values while there are remaining header packets to
|
||||
be emitted, reaches 0 when the codec is ready for input, and becomes
|
||||
positive when a frame has been processed and data packets are ready.*/
|
||||
int packet_state;
|
||||
/*The maximum distance between keyframes.*/
|
||||
ogg_uint32_t keyframe_frequency_force;
|
||||
/*The number of duplicates to produce for the next frame.*/
|
||||
ogg_uint32_t dup_count;
|
||||
/*The number of duplicates remaining to be emitted for the current frame.*/
|
||||
ogg_uint32_t nqueued_dups;
|
||||
/*The number of duplicates emitted for the last frame.*/
|
||||
ogg_uint32_t prev_dup_count;
|
||||
/*The current speed level.*/
|
||||
int sp_level;
|
||||
/*Whether or not VP3 compatibility mode has been enabled.*/
|
||||
unsigned char vp3_compatible;
|
||||
/*Whether or not any INTER frames have been coded.*/
|
||||
unsigned char coded_inter_frame;
|
||||
/*Whether or not previous frame was dropped.*/
|
||||
unsigned char prevframe_dropped;
|
||||
/*Stores most recently chosen Huffman tables for each frame type, DC and AC
|
||||
coefficients, and luma and chroma tokens.
|
||||
The actual Huffman table used for a given coefficient depends not only on
|
||||
the choice made here, but also its index in the zig-zag ordering.*/
|
||||
unsigned char huff_idxs[2][2][2];
|
||||
/*Current count of bits used by each MV coding mode.*/
|
||||
size_t mv_bits[2];
|
||||
/*The mode scheme chooser for estimating mode coding costs.*/
|
||||
oc_mode_scheme_chooser chooser;
|
||||
/*The number of vertical super blocks in an MCU.*/
|
||||
int mcu_nvsbs;
|
||||
/*The SSD error for skipping each fragment in the current MCU.*/
|
||||
unsigned *mcu_skip_ssd;
|
||||
/*The DCT token lists for each coefficient and each plane.*/
|
||||
unsigned char **dct_tokens[3];
|
||||
/*The extra bits associated with each DCT token.*/
|
||||
ogg_uint16_t **extra_bits[3];
|
||||
/*The number of DCT tokens for each coefficient for each plane.*/
|
||||
ptrdiff_t ndct_tokens[3][64];
|
||||
/*Pending EOB runs for each coefficient for each plane.*/
|
||||
ogg_uint16_t eob_run[3][64];
|
||||
/*The offset of the first DCT token for each coefficient for each plane.*/
|
||||
unsigned char dct_token_offs[3][64];
|
||||
/*The last DC coefficient for each plane and reference frame.*/
|
||||
int dc_pred_last[3][3];
|
||||
#if defined(OC_COLLECT_METRICS)
|
||||
/*Fragment SATD statistics for MB mode estimation metrics.*/
|
||||
unsigned *frag_satd;
|
||||
/*Fragment SSD statistics for MB mode estimation metrics.*/
|
||||
unsigned *frag_ssd;
|
||||
#endif
|
||||
/*The R-D optimization parameter.*/
|
||||
int lambda;
|
||||
/*The huffman tables in use.*/
|
||||
th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||
/*The quantization parameters in use.*/
|
||||
th_quant_info qinfo;
|
||||
oc_iquant *enquant_tables[64][3][2];
|
||||
oc_iquant_table enquant_table_data[64][3][2];
|
||||
/*An "average" quantizer for each quantizer type (INTRA or INTER) and qi
|
||||
value.
|
||||
This is used to parameterize the rate control decisions.
|
||||
They are kept in the log domain to simplify later processing.
|
||||
Keep in mind these are DCT domain quantizers, and so are scaled by an
|
||||
additional factor of 4 from the pixel domain.*/
|
||||
ogg_int64_t log_qavg[2][64];
|
||||
/*The buffer state used to drive rate control.*/
|
||||
oc_rc_state rc;
|
||||
/*Table for encoder acceleration functions.*/
|
||||
oc_enc_opt_vtable opt_vtable;
|
||||
};
|
||||
|
||||
|
||||
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
|
||||
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
|
||||
#if defined(OC_COLLECT_METRICS)
|
||||
void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
|
||||
void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*Perform fullpel motion search for a single MB against both reference frames.*/
|
||||
void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi);
|
||||
/*Refine a MB MV for one frame.*/
|
||||
void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
|
||||
/*Refine the block MVs.*/
|
||||
void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
|
||||
|
||||
|
||||
|
||||
/*Used to rollback a tokenlog transaction when we retroactively decide to skip
|
||||
a fragment.
|
||||
A checkpoint is taken right before each token is added.*/
|
||||
struct oc_token_checkpoint{
|
||||
/*The color plane the token was added to.*/
|
||||
unsigned char pli;
|
||||
/*The zig-zag index the token was added to.*/
|
||||
unsigned char zzi;
|
||||
/*The outstanding EOB run count before the token was added.*/
|
||||
ogg_uint16_t eob_run;
|
||||
/*The token count before the token was added.*/
|
||||
ptrdiff_t ndct_tokens;
|
||||
};
|
||||
|
||||
|
||||
|
||||
void oc_enc_tokenize_start(oc_enc_ctx *_enc);
|
||||
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
|
||||
ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
|
||||
int _zzi,oc_token_checkpoint **_stack,int _acmin);
|
||||
void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
|
||||
const oc_token_checkpoint *_stack,int _n);
|
||||
void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
|
||||
int _pli,int _fragy0,int _frag_yend);
|
||||
void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
|
||||
const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
|
||||
int _prev_ndct_tokens1,int _prev_eob_run1);
|
||||
void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
|
||||
|
||||
|
||||
|
||||
/*Utility routine to encode one of the header packets.*/
|
||||
int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
|
||||
oggpack_buffer *_opb,const th_quant_info *_qinfo,
|
||||
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
|
||||
const char *_vendor,th_comment *_tc,ogg_packet *_op);
|
||||
|
||||
|
||||
|
||||
/*Encoder-specific accelerated functions.*/
|
||||
void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,const unsigned char *_ref,int _ystride);
|
||||
void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,int _ystride);
|
||||
unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
|
||||
const unsigned char *_src,const unsigned char *_ref,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
|
||||
const unsigned char *_src,const unsigned char *_ref1,
|
||||
const unsigned char *_ref2,int _ystride,unsigned _thresh);
|
||||
unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
|
||||
const unsigned char *_src,const unsigned char *_ref,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
|
||||
const unsigned char *_src,const unsigned char *_ref1,
|
||||
const unsigned char *_ref2,int _ystride,unsigned _thresh);
|
||||
unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||
void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
|
||||
unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
|
||||
const ogg_int16_t _x[64]);
|
||||
|
||||
/*Default pure-C implementations.*/
|
||||
void oc_enc_vtable_init_c(oc_enc_ctx *_enc);
|
||||
|
||||
void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,const unsigned char *_ref,int _ystride);
|
||||
void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_enc_frag_copy2_c(unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||
unsigned oc_enc_frag_sad_c(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
|
||||
void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
|
||||
#endif
|
@ -1,67 +0,0 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: encoder_disabled.c 16503 2009-08-22 18:14:02Z giles $
|
||||
|
||||
********************************************************************/
|
||||
#include "apiwrapper.h"
|
||||
#include "encint.h"
|
||||
|
||||
th_enc_ctx *th_encode_alloc(const th_info *_info){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void th_encode_free(th_enc_ctx *_enc){}
|
||||
|
||||
|
||||
int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int theora_encode_init(theora_state *_te,theora_info *_ci){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int theora_encode_header(theora_state *_te,ogg_packet *_op){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
|
||||
return OC_DISABLED;
|
||||
}
|
||||
|
||||
int theora_encode_tables(theora_state *_te,ogg_packet *_op){
|
||||
return OC_DISABLED;
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
#if !defined(_enquant_H)
|
||||
# define _enquant_H (1)
|
||||
# include "quant.h"
|
||||
|
||||
typedef struct oc_iquant oc_iquant;
|
||||
|
||||
#define OC_QUANT_MAX_LOG (OC_Q57(OC_STATIC_ILOG_32(OC_QUANT_MAX)-1))
|
||||
|
||||
/*Used to compute x/d via ((x*m>>16)+x>>l)+(x<0)
|
||||
(i.e., one 16x16->16 mul, 2 shifts, and 2 adds).
|
||||
This is not an approximation; for 16-bit x and d, it is exact.*/
|
||||
struct oc_iquant{
|
||||
/*The multiplier: the m term of the x/d expression documented just before
this struct.*/
ogg_int16_t m;
|
||||
/*The shift amount: the l term of the x/d expression documented just before
this struct.*/
ogg_int16_t l;
|
||||
};
|
||||
|
||||
typedef oc_iquant oc_iquant_table[64];
|
||||
|
||||
|
||||
|
||||
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
|
||||
void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
|
||||
oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
|
||||
void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
|
||||
ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
|
||||
|
||||
#endif
|
@ -11,17 +11,12 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: fragment.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: fragment.c 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#include <string.h>
|
||||
#include "internal.h"
|
||||
|
||||
void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride){
|
||||
(*_state->opt_vtable.frag_copy)(_dst,_src,_ystride);
|
||||
}
|
||||
|
||||
void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
|
||||
int i;
|
||||
for(i=8;i-->0;){
|
||||
@ -31,9 +26,24 @@ void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
|
||||
}
|
||||
}
|
||||
|
||||
void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
|
||||
int _ystride,const ogg_int16_t _residue[64]){
|
||||
_state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue);
|
||||
/*Copies every fragment named in a list of fragment indices from one reference
   frame into another, in list order.
  Each index is translated into a byte offset through _frag_buf_offs, and the
   same offset is applied to both frames.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_c(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  const ptrdiff_t *fragi_cur;
  const ptrdiff_t *fragi_end;
  /*Nothing to do for an empty list; this also keeps the end pointer
     computation below well-defined.*/
  if(_nfragis<=0)return;
  fragi_end=_fragis+_nfragis;
  for(fragi_cur=_fragis;fragi_cur<fragi_end;fragi_cur++){
    ptrdiff_t offset;
    offset=_frag_buf_offs[*fragi_cur];
    oc_frag_copy_c(_dst_frame+offset,_src_frame+offset,_ystride);
  }
}
|
||||
|
||||
void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
|
||||
@ -46,11 +56,6 @@ void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
|
||||
}
|
||||
}
|
||||
|
||||
void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
|
||||
_state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue);
|
||||
}
|
||||
|
||||
void oc_frag_recon_inter_c(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
|
||||
int i;
|
||||
@ -62,12 +67,6 @@ void oc_frag_recon_inter_c(unsigned char *_dst,
|
||||
}
|
||||
}
|
||||
|
||||
void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride,
|
||||
const ogg_int16_t _residue[64]){
|
||||
_state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue);
|
||||
}
|
||||
|
||||
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
|
||||
int i;
|
||||
@ -80,8 +79,4 @@ void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
||||
}
|
||||
}
|
||||
|
||||
void oc_restore_fpu(const oc_theora_state *_state){
|
||||
_state->opt_vtable.restore_fpu();
|
||||
}
|
||||
|
||||
void oc_restore_fpu_c(void){}
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: huffdec.c 16702 2009-11-15 00:40:55Z tterribe $
|
||||
last mod: $Id: huffdec.c 17577 2010-10-29 04:00:07Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -22,14 +22,60 @@
|
||||
#include "decint.h"
|
||||
|
||||
|
||||
/*The ANSI offsetof macro is broken on some platforms (e.g., older DECs).*/
|
||||
#define _ogg_offsetof(_type,_field)\
|
||||
((size_t)((char *)&((_type *)0)->_field-(char *)0))
|
||||
|
||||
/*The number of internal tokens associated with each of the spec tokens.*/
|
||||
static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
|
||||
1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
|
||||
};
|
||||
/*Instead of storing every branching in the tree, subtrees can be collapsed
|
||||
into one node, with a table of size 1<<nbits pointing directly to its
|
||||
descendants nbits levels down.
|
||||
This allows more than one bit to be read at a time, and avoids following all
|
||||
the intermediate branches with next to no increased code complexity once
|
||||
the collapsed tree has been built.
|
||||
We do _not_ require that a subtree be complete to be collapsed, but instead
|
||||
store duplicate pointers in the table, and record the actual depth of the
|
||||
node below its parent.
|
||||
This tells us the number of bits to advance the stream after reaching it.
|
||||
|
||||
This turns out to be equivalent to the method described in \cite{Hash95},
|
||||
without the requirement that codewords be sorted by length.
|
||||
If the codewords were sorted by length (so-called ``canonical-codes''), they
|
||||
could be decoded much faster via either Lindell and Moffat's approach or
|
||||
Hashemian's Condensed Huffman Code approach, the latter of which has an
|
||||
extremely small memory footprint.
|
||||
We can't use Choueka et al.'s finite state machine approach, which is
|
||||
extremely fast, because we can't allow multiple symbols to be output at a
|
||||
time; the codebook can and does change between symbols.
|
||||
It also has very large memory requirements, which impairs cache coherency.
|
||||
|
||||
We store the tree packed in an array of 16-bit integers (words).
|
||||
Each node consists of a single word, followed consecutively by two or more
|
||||
indices of its children.
|
||||
Let n be the value of this first word.
|
||||
This is the number of bits that need to be read to traverse the node, and
|
||||
must be positive.
|
||||
1<<n entries follow in the array, each an index to a child node.
|
||||
If the child is positive, then it is the index of another internal node in
|
||||
the table.
|
||||
If the child is negative or zero, then it is a leaf node.
|
||||
These are stored directly in the child pointer to save space, since they only
|
||||
require a single word.
|
||||
If a leaf node would have been encountered before reading n bits, then it is
|
||||
duplicated the necessary number of times in this table.
|
||||
Leaf nodes pack both a token value and their actual depth in the tree.
|
||||
The token in the leaf node is (-leaf&255).
|
||||
The number of bits that need to be consumed to reach the leaf, starting from
|
||||
the current node, is (-leaf>>8).
|
||||
|
||||
@ARTICLE{Hash95,
|
||||
author="Reza Hashemian",
|
||||
title="Memory Efficient and High-Speed Search {Huffman} Coding",
|
||||
journal="{IEEE} Transactions on Communications",
|
||||
volume=43,
|
||||
number=10,
|
||||
pages="2576--2581",
|
||||
month=Oct,
|
||||
year=1995
|
||||
}*/
|
||||
|
||||
|
||||
|
||||
/*The map from external spec-defined tokens to internal tokens.
|
||||
This is constructed so that any extra bits read with the original token value
|
||||
@ -99,391 +145,371 @@ static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={
|
||||
40
|
||||
};
|
||||
|
||||
/*These three functions are really part of the bitpack.c module, but
|
||||
they are only used here.
|
||||
Declaring local static versions so they can be inlined saves considerable
|
||||
function call overhead.*/
|
||||
|
||||
/*Tops up the bit window of _b from its byte buffer and returns the new window
   value.
  Whole bytes are appended below the bits already held until fewer than 8 free
   bit positions remain.
  Once the read pointer reaches the end of the buffer, the available-bit count
   is forced to OC_LOTS_OF_BITS, which stops any further byte reads.
  The read pointer and bit count are written back to _b; the window itself is
   only returned, so the caller decides whether to store it.
  This version doesn't bother setting eof because we won't check for it after
   we've started decoding DCT tokens.
  _b:    The pack buffer to refill.
  _bits: The number of bits the caller wants to look at.
  Return: The refilled window.*/
static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
  const unsigned char       *cur=_b->ptr;
  const unsigned char *const end=_b->stop;
  oc_pb_window               win=_b->window;
  int                        nbits=_b->bits;
  if(cur>=end)nbits=OC_LOTS_OF_BITS;
  for(;nbits<=OC_PB_WINDOW_SIZE-8;){
    oc_pb_window next_byte;
    next_byte=(oc_pb_window)*cur;
    cur++;
    nbits+=8;
    /*Place the new byte directly below the bits already in the window.*/
    win|=next_byte<<(OC_PB_WINDOW_SIZE-nbits);
    if(cur>=end)nbits=OC_LOTS_OF_BITS;
  }
  /*If the request still exceeds what whole bytes could supply, fill the
     remaining low bits from the top of the next byte without consuming it.*/
  if(_bits>nbits)win|=*cur>>(nbits&7);
  _b->ptr=cur;
  _b->bits=nbits;
  return win;
}
|
||||
/*The log base 2 of number of internal tokens associated with each of the spec
|
||||
tokens (i.e., how many of the extra bits are folded into the token value).
|
||||
Increasing the maximum value beyond 3 will enlarge the amount of stack
|
||||
required for tree construction.*/
|
||||
static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={
|
||||
0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3
|
||||
};
|
||||
|
||||
|
||||
/*Read in bits without advancing the bit pointer.
|
||||
Here we assume 0<=_bits&&_bits<=32.*/
|
||||
static long oc_pack_look(oc_pack_buf *_b,int _bits){
|
||||
oc_pb_window window;
|
||||
int available;
|
||||
long result;
|
||||
window=_b->window;
|
||||
available=_b->bits;
|
||||
if(_bits==0)return 0;
|
||||
if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits);
|
||||
result=window>>OC_PB_WINDOW_SIZE-_bits;
|
||||
return result;
|
||||
}
|
||||
|
||||
/*Advance the bit pointer.*/
|
||||
static void oc_pack_adv(oc_pack_buf *_b,int _bits){
|
||||
/*We ignore the special cases for _bits==0 and _bits==32 here, since they are
|
||||
never used actually used.
|
||||
OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read
|
||||
32 bits in a single go, and would require a 32 GB lookup table (assuming
|
||||
8 byte pointers, since 4 byte pointers couldn't fit such a table).*/
|
||||
_b->window<<=_bits;
|
||||
_b->bits-=_bits;
|
||||
}
|
||||
|
||||
|
||||
/*The log_2 of the size a lookup table is allowed to grow to relative to
|
||||
the number of unique nodes it contains.
|
||||
E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
|
||||
wasted (each node will have an amortized cost of at most 20 bytes when using
|
||||
4-byte pointers).
|
||||
/*The size a lookup table is allowed to grow to relative to the number of
|
||||
unique nodes it contains.
|
||||
E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is
|
||||
wasted (1/4 of the space must be used).
|
||||
Larger numbers can decode tokens with fewer read operations, while smaller
|
||||
numbers may save more space (requiring as little as 8 bytes amortized per
|
||||
node, though there will be more nodes).
|
||||
numbers may save more space.
|
||||
With a sample file:
|
||||
32233473 read calls are required when no tree collapsing is done (100.0%).
|
||||
19269269 read calls are required when OC_HUFF_SLUSH is 0 (59.8%).
|
||||
11144969 read calls are required when OC_HUFF_SLUSH is 1 (34.6%).
|
||||
10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
|
||||
10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
|
||||
Since a value of 1 gets us the vast majority of the speed-up with only a
|
||||
small amount of wasted memory, this is what we use.*/
|
||||
#define OC_HUFF_SLUSH (1)
|
||||
19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%).
|
||||
11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%).
|
||||
10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%).
|
||||
10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%).
|
||||
Since a value of 2 gets us the vast majority of the speed-up with only a
|
||||
small amount of wasted memory, this is what we use.
|
||||
This value must be less than 128, or you could create a tree with more than
|
||||
32767 entries, which would overflow the 16-bit words used to index it.*/
|
||||
#define OC_HUFF_SLUSH (2)
|
||||
/*The root of the tree is on the fast path, and a larger value here is more
|
||||
beneficial than elsewhere in the tree.
|
||||
7 appears to give the best performance, trading off between increased use of
|
||||
the single-read fast path and cache footprint for the tables, though
|
||||
obviously this will depend on your cache size.
|
||||
Using 7 here, the VP3 tables are about twice as large compared to using 2.*/
|
||||
#define OC_ROOT_HUFF_SLUSH (7)
|
||||
|
||||
|
||||
/*Determines the size in bytes of a Huffman tree node that represents a
|
||||
|
||||
/*Unpacks a Huffman codebook.
|
||||
_opb: The buffer to unpack from.
|
||||
_tokens: Stores a list of internal tokens, in the order they were found in
|
||||
the codebook, and the lengths of their corresponding codewords.
|
||||
This is enough to completely define the codebook, while minimizing
|
||||
stack usage and avoiding temporary allocations (for platforms
|
||||
where free() is a no-op).
|
||||
Return: The number of internal tokens in the codebook, or a negative value
|
||||
on error.*/
|
||||
int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){
|
||||
ogg_uint32_t code;
|
||||
int len;
|
||||
int ntokens;
|
||||
int nleaves;
|
||||
code=0;
|
||||
len=ntokens=nleaves=0;
|
||||
for(;;){
|
||||
long bits;
|
||||
bits=oc_pack_read1(_opb);
|
||||
/*Only process nodes so long as there's more bits in the buffer.*/
|
||||
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
||||
/*Read an internal node:*/
|
||||
if(!bits){
|
||||
len++;
|
||||
/*Don't allow codewords longer than 32 bits.*/
|
||||
if(len>32)return TH_EBADHEADER;
|
||||
}
|
||||
/*Read a leaf node:*/
|
||||
else{
|
||||
ogg_uint32_t code_bit;
|
||||
int neb;
|
||||
int nentries;
|
||||
int token;
|
||||
/*Don't allow more than 32 spec-tokens per codebook.*/
|
||||
if(++nleaves>32)return TH_EBADHEADER;
|
||||
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
|
||||
neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits];
|
||||
token=OC_DCT_TOKEN_MAP[bits];
|
||||
nentries=1<<neb;
|
||||
while(nentries-->0){
|
||||
_tokens[ntokens][0]=(unsigned char)token++;
|
||||
_tokens[ntokens][1]=(unsigned char)(len+neb);
|
||||
ntokens++;
|
||||
}
|
||||
code_bit=0x80000000U>>len-1;
|
||||
while(len>0&&(code&code_bit)){
|
||||
code^=code_bit;
|
||||
code_bit<<=1;
|
||||
len--;
|
||||
}
|
||||
if(len<=0)break;
|
||||
code|=code_bit;
|
||||
}
|
||||
}
|
||||
return ntokens;
|
||||
}
|
||||
|
||||
/*Count how many tokens would be required to fill a subtree at depth _depth.
|
||||
_tokens: A list of internal tokens, in the order they are found in the
|
||||
codebook, and the lengths of their corresponding codewords.
|
||||
_depth: The depth of the desired node in the corresponding tree structure.
|
||||
Return: The number of tokens that belong to that subtree.*/
|
||||
static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){
|
||||
ogg_uint32_t code;
|
||||
int ti;
|
||||
code=0;
|
||||
ti=0;
|
||||
do{
|
||||
if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth;
|
||||
else{
|
||||
/*Because of the expanded internal tokens, we can have codewords as long
|
||||
as 35 bits.
|
||||
A single recursion here is enough to advance past them.*/
|
||||
code++;
|
||||
ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31);
|
||||
}
|
||||
}
|
||||
while(code<0x80000000U);
|
||||
return ti;
|
||||
}
|
||||
|
||||
/*Compute the number of bits to use for a collapsed tree node at the given
|
||||
depth.
|
||||
_tokens: A list of internal tokens, in the order they are found in the
|
||||
codebook, and the lengths of their corresponding codewords.
|
||||
_ntokens: The number of tokens corresponding to this tree node.
|
||||
_depth: The depth of this tree node.
|
||||
Return: The number of bits to use for a collapsed tree node rooted here.
|
||||
This is always at least one, even if this was a leaf node.*/
|
||||
static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2],
|
||||
int _ntokens,int _depth){
|
||||
int got_leaves;
|
||||
int loccupancy;
|
||||
int occupancy;
|
||||
int slush;
|
||||
int nbits;
|
||||
int best_nbits;
|
||||
slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH;
|
||||
/*It's legal to have a tree with just a single node, which requires no bits
|
||||
to decode and always returns the same token.
|
||||
However, no encoder actually does this (yet).
|
||||
To avoid a special case in oc_huff_token_decode(), we force the number of
|
||||
lookahead bits to be at least one.
|
||||
This will produce a tree that looks ahead one bit and then advances the
|
||||
stream zero bits.*/
|
||||
nbits=1;
|
||||
occupancy=2;
|
||||
got_leaves=1;
|
||||
do{
|
||||
int ti;
|
||||
if(got_leaves)best_nbits=nbits;
|
||||
nbits++;
|
||||
got_leaves=0;
|
||||
loccupancy=occupancy;
|
||||
for(occupancy=ti=0;ti<_ntokens;occupancy++){
|
||||
if(_tokens[ti][1]<_depth+nbits)ti++;
|
||||
else if(_tokens[ti][1]==_depth+nbits){
|
||||
got_leaves=1;
|
||||
ti++;
|
||||
}
|
||||
else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits);
|
||||
}
|
||||
}
|
||||
while(occupancy>loccupancy&&occupancy*slush>=1<<nbits);
|
||||
return best_nbits;
|
||||
}
|
||||
|
||||
/*Determines the size in words of a Huffman tree node that represents a
|
||||
subtree of depth _nbits.
|
||||
_nbits: The depth of the subtree.
|
||||
If this is 0, the node is a leaf node.
|
||||
Otherwise 1<<_nbits pointers are allocated for children.
|
||||
Return: The number of bytes required to store the node.*/
|
||||
This must be greater than zero.
|
||||
Return: The number of words required to store the node.*/
|
||||
static size_t oc_huff_node_size(int _nbits){
|
||||
size_t size;
|
||||
size=_ogg_offsetof(oc_huff_node,nodes);
|
||||
if(_nbits>0)size+=sizeof(oc_huff_node *)*(1<<_nbits);
|
||||
return size;
|
||||
return 1+(1<<_nbits);
|
||||
}
|
||||
|
||||
static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){
|
||||
oc_huff_node *ret;
|
||||
ret=(oc_huff_node *)*_storage;
|
||||
ret->nbits=(unsigned char)_nbits;
|
||||
(*_storage)+=_size;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*Determines the size in bytes of a Huffman tree.
|
||||
_nbits: The depth of the subtree.
|
||||
If this is 0, the node is a leaf node.
|
||||
Otherwise storage for 1<<_nbits pointers are added for children.
|
||||
Return: The number of bytes required to store the tree.*/
|
||||
static size_t oc_huff_tree_size(const oc_huff_node *_node){
|
||||
size_t size;
|
||||
size=oc_huff_node_size(_node->nbits);
|
||||
if(_node->nbits){
|
||||
int nchildren;
|
||||
int i;
|
||||
nchildren=1<<_node->nbits;
|
||||
for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){
|
||||
size+=oc_huff_tree_size(_node->nodes[i]);
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
/*Unpacks a sub-tree from the given buffer.
|
||||
_opb: The buffer to unpack from.
|
||||
_binodes: The nodes to store the sub-tree in.
|
||||
_nbinodes: The number of nodes available for the sub-tree.
|
||||
Return: 0 on success, or a negative value on error.*/
|
||||
static int oc_huff_tree_unpack(oc_pack_buf *_opb,
|
||||
oc_huff_node *_binodes,int _nbinodes){
|
||||
oc_huff_node *binode;
|
||||
long bits;
|
||||
int nused;
|
||||
if(_nbinodes<1)return TH_EBADHEADER;
|
||||
binode=_binodes;
|
||||
nused=0;
|
||||
bits=oc_pack_read1(_opb);
|
||||
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
||||
/*Read an internal node:*/
|
||||
if(!bits){
|
||||
int ret;
|
||||
nused++;
|
||||
binode->nbits=1;
|
||||
binode->depth=1;
|
||||
binode->nodes[0]=_binodes+nused;
|
||||
ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
|
||||
if(ret>=0){
|
||||
nused+=ret;
|
||||
binode->nodes[1]=_binodes+nused;
|
||||
ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
|
||||
}
|
||||
if(ret<0)return ret;
|
||||
nused+=ret;
|
||||
}
|
||||
/*Read a leaf node:*/
|
||||
else{
|
||||
int ntokens;
|
||||
int token;
|
||||
int i;
|
||||
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
|
||||
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
||||
/*Find out how many internal tokens we translate this external token into.*/
|
||||
ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
|
||||
if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
|
||||
/*Fill in a complete binary tree pointing to the internal tokens.*/
|
||||
for(i=1;i<ntokens;i<<=1){
|
||||
int j;
|
||||
binode=_binodes+nused;
|
||||
nused+=i;
|
||||
for(j=0;j<i;j++){
|
||||
binode[j].nbits=1;
|
||||
binode[j].depth=1;
|
||||
binode[j].nodes[0]=_binodes+nused+2*j;
|
||||
binode[j].nodes[1]=_binodes+nused+2*j+1;
|
||||
/*Produces a collapsed-tree representation of the given token list.
|
||||
_tree: The storage for the collapsed Huffman tree.
|
||||
This may be NULL to compute the required storage size instead of
|
||||
constructing the tree.
|
||||
_tokens: A list of internal tokens, in the order they are found in the
|
||||
codebook, and the lengths of their corresponding codewords.
|
||||
_ntokens: The number of tokens corresponding to this tree node.
|
||||
Return: The number of words required to store the tree.*/
|
||||
static size_t oc_huff_tree_collapse(ogg_int16_t *_tree,
|
||||
unsigned char _tokens[][2],int _ntokens){
|
||||
ogg_int16_t node[34];
|
||||
unsigned char depth[34];
|
||||
unsigned char last[34];
|
||||
size_t ntree;
|
||||
int ti;
|
||||
int l;
|
||||
depth[0]=0;
|
||||
last[0]=(unsigned char)(_ntokens-1);
|
||||
ntree=0;
|
||||
ti=0;
|
||||
l=0;
|
||||
do{
|
||||
int nbits;
|
||||
nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]);
|
||||
node[l]=(ogg_int16_t)ntree;
|
||||
ntree+=oc_huff_node_size(nbits);
|
||||
if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits;
|
||||
do{
|
||||
while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){
|
||||
if(_tree!=NULL){
|
||||
ogg_int16_t leaf;
|
||||
int nentries;
|
||||
nentries=1<<depth[l]+nbits-_tokens[ti][1];
|
||||
leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]);
|
||||
while(nentries-->0)_tree[node[l]++]=leaf;
|
||||
}
|
||||
ti++;
|
||||
}
|
||||
if(ti<=last[l]){
|
||||
/*We need to recurse*/
|
||||
depth[l+1]=(unsigned char)(depth[l]+nbits);
|
||||
if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree;
|
||||
l++;
|
||||
last[l]=
|
||||
(unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1);
|
||||
break;
|
||||
}
|
||||
/*Pop back up a level of recursion.*/
|
||||
else if(l-->0)nbits=depth[l+1]-depth[l];
|
||||
}
|
||||
/*And now the leaf nodes with those tokens.*/
|
||||
token=OC_DCT_TOKEN_MAP[bits];
|
||||
for(i=0;i<ntokens;i++){
|
||||
binode=_binodes+nused++;
|
||||
binode->nbits=0;
|
||||
binode->depth=1;
|
||||
binode->token=token+i;
|
||||
}
|
||||
while(l>=0);
|
||||
}
|
||||
return nused;
|
||||
}
|
||||
|
||||
/*Finds the depth of shortest branch of the given sub-tree.
|
||||
The tree must be binary.
|
||||
_binode: The root of the given sub-tree.
|
||||
_binode->nbits must be 0 or 1.
|
||||
Return: The smallest depth of a leaf node in this sub-tree.
|
||||
0 indicates this sub-tree is a leaf node.*/
|
||||
static int oc_huff_tree_mindepth(oc_huff_node *_binode){
|
||||
int depth0;
|
||||
int depth1;
|
||||
if(_binode->nbits==0)return 0;
|
||||
depth0=oc_huff_tree_mindepth(_binode->nodes[0]);
|
||||
depth1=oc_huff_tree_mindepth(_binode->nodes[1]);
|
||||
return OC_MINI(depth0,depth1)+1;
|
||||
}
|
||||
|
||||
/*Finds the number of internal nodes at a given depth, plus the number of
|
||||
leaves at that depth or shallower.
|
||||
The tree must be binary.
|
||||
_binode: The root of the given sub-tree.
|
||||
_binode->nbits must be 0 or 1.
|
||||
Return: The number of entries that would be contained in a jump table of the
|
||||
given depth.*/
|
||||
static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){
|
||||
if(_binode->nbits==0||_depth<=0)return 1;
|
||||
else{
|
||||
return oc_huff_tree_occupancy(_binode->nodes[0],_depth-1)+
|
||||
oc_huff_tree_occupancy(_binode->nodes[1],_depth-1);
|
||||
}
|
||||
}
|
||||
|
||||
/*Makes a copy of the given Huffman tree.
|
||||
_node: The Huffman tree to copy.
|
||||
Return: The copy of the Huffman tree.*/
|
||||
static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node,
|
||||
char **_storage){
|
||||
oc_huff_node *ret;
|
||||
ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits);
|
||||
ret->depth=_node->depth;
|
||||
if(_node->nbits){
|
||||
int nchildren;
|
||||
int i;
|
||||
int inext;
|
||||
nchildren=1<<_node->nbits;
|
||||
for(i=0;i<nchildren;){
|
||||
ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage);
|
||||
inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
|
||||
while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
|
||||
}
|
||||
}
|
||||
else ret->token=_node->token;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*Computes the storage needed for the collapsed form of a binary sub-tree,
   without building it.
  The lookup depth is chosen with the same occupancy test that
   oc_huff_tree_collapse() applies.
  _binode: The root of the binary sub-tree.
           _binode->nbits must be 0 or 1.
  _depth:  The number of levels to descend before starting a new collapsed
            node; 0 starts one at _binode itself.
  Return: The total size, in the units returned by oc_huff_node_size().*/
static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
  size_t total;
  int    depth;
  int    prev;
  int    cur;
  /*Not yet at the level where the next collapsed node starts: just sum the
     two halves.*/
  if(_depth>0&&_binode->nbits!=0){
    total=oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1);
    total+=oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
    return total;
  }
  /*The tree is complete down to its shallowest leaf, so start there and keep
     deepening while new entries appear and the table stays dense enough.*/
  depth=oc_huff_tree_mindepth(_binode);
  cur=1<<depth;
  for(;;){
    prev=cur;
    depth++;
    cur=oc_huff_tree_occupancy(_binode,depth);
    if(cur<=prev)break;
    if(cur<(1<<OC_MAXI(depth-OC_HUFF_SLUSH,0)))break;
  }
  /*The last depth tried was rejected; back up to the one before it.*/
  depth--;
  total=oc_huff_node_size(depth);
  if(depth>0){
    total+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1);
    total+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
  }
  return total;
}
|
||||
|
||||
static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
|
||||
char **_storage);
|
||||
|
||||
/*Fills the given nodes table with all the children in the sub-tree at the
|
||||
given depth.
|
||||
The nodes in the sub-tree with a depth less than that stored in the table
|
||||
are freed.
|
||||
The sub-tree must be binary and complete up until the given depth.
|
||||
_nodes: The nodes table to fill.
|
||||
_binode: The root of the sub-tree to fill it with.
|
||||
_binode->nbits must be 0 or 1.
|
||||
_level: The current level in the table.
|
||||
0 indicates that the current node should be stored, regardless of
|
||||
whether it is a leaf node or an internal node.
|
||||
_depth: The depth of the nodes to fill the table with, relative to their
|
||||
parent.*/
|
||||
static void oc_huff_node_fill(oc_huff_node **_nodes,
|
||||
oc_huff_node *_binode,int _level,int _depth,char **_storage){
|
||||
if(_level<=0||_binode->nbits==0){
|
||||
int i;
|
||||
_binode->depth=(unsigned char)(_depth-_level);
|
||||
_nodes[0]=oc_huff_tree_collapse(_binode,_storage);
|
||||
for(i=1;i<1<<_level;i++)_nodes[i]=_nodes[0];
|
||||
}
|
||||
else{
|
||||
_level--;
|
||||
oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth,_storage);
|
||||
_nodes+=1<<_level;
|
||||
oc_huff_node_fill(_nodes,_binode->nodes[1],_level,_depth,_storage);
|
||||
}
|
||||
}
|
||||
|
||||
/*Finds the largest complete sub-tree rooted at the current node and collapses
|
||||
it into a single node.
|
||||
This procedure is then applied recursively to all the children of that node.
|
||||
_binode: The root of the sub-tree to collapse.
|
||||
_binode->nbits must be 0 or 1.
|
||||
Return: The new root of the collapsed sub-tree.*/
|
||||
static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
|
||||
char **_storage){
|
||||
oc_huff_node *root;
|
||||
size_t size;
|
||||
int mindepth;
|
||||
int depth;
|
||||
int loccupancy;
|
||||
int occupancy;
|
||||
depth=mindepth=oc_huff_tree_mindepth(_binode);
|
||||
occupancy=1<<mindepth;
|
||||
do{
|
||||
loccupancy=occupancy;
|
||||
occupancy=oc_huff_tree_occupancy(_binode,++depth);
|
||||
}
|
||||
while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
|
||||
depth--;
|
||||
if(depth<=0)return oc_huff_tree_copy(_binode,_storage);
|
||||
size=oc_huff_node_size(depth);
|
||||
root=oc_huff_node_init(_storage,size,depth);
|
||||
root->depth=_binode->depth;
|
||||
oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
|
||||
return root;
|
||||
while(l>=0);
|
||||
return ntree;
|
||||
}
|
||||
|
||||
/*Unpacks a set of Huffman trees, and reduces them to a collapsed
|
||||
representation.
|
||||
_opb: The buffer to unpack the trees from.
|
||||
_nodes: The table to fill with the Huffman trees.
|
||||
Return: 0 on success, or a negative value on error.*/
|
||||
Return: 0 on success, or a negative value on error.
|
||||
The caller is responsible for cleaning up any partially initialized
|
||||
_nodes on failure.*/
|
||||
int oc_huff_trees_unpack(oc_pack_buf *_opb,
|
||||
oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
|
||||
ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
|
||||
int i;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||
oc_huff_node nodes[511];
|
||||
char *storage;
|
||||
size_t size;
|
||||
int ret;
|
||||
unsigned char tokens[256][2];
|
||||
int ntokens;
|
||||
ogg_int16_t *tree;
|
||||
size_t size;
|
||||
/*Unpack the full tree into a temporary buffer.*/
|
||||
ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes));
|
||||
if(ret<0)return ret;
|
||||
/*Figure out how big the collapsed tree will be.*/
|
||||
size=oc_huff_tree_collapse_size(nodes,0);
|
||||
storage=(char *)_ogg_calloc(1,size);
|
||||
if(storage==NULL)return TH_EFAULT;
|
||||
/*And collapse it.*/
|
||||
_nodes[i]=oc_huff_tree_collapse(nodes,&storage);
|
||||
ntokens=oc_huff_tree_unpack(_opb,tokens);
|
||||
if(ntokens<0)return ntokens;
|
||||
/*Figure out how big the collapsed tree will be and allocate space for it.*/
|
||||
size=oc_huff_tree_collapse(NULL,tokens,ntokens);
|
||||
/*This should never happen; if it does it means you set OC_HUFF_SLUSH or
|
||||
OC_ROOT_HUFF_SLUSH too large.*/
|
||||
if(size>32767)return TH_EIMPL;
|
||||
tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
|
||||
if(tree==NULL)return TH_EFAULT;
|
||||
/*Construct the collapsed tree.*/
|
||||
oc_huff_tree_collapse(tree,tokens,ntokens);
|
||||
_nodes[i]=tree;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*Determines the size in words of a Huffman subtree.
|
||||
_tree: The complete Huffman tree.
|
||||
_node: The index of the root of the desired subtree.
|
||||
Return: The number of words required to store the tree.*/
|
||||
static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){
|
||||
size_t size;
|
||||
int nchildren;
|
||||
int n;
|
||||
int i;
|
||||
n=_tree[_node];
|
||||
size=oc_huff_node_size(n);
|
||||
nchildren=1<<n;
|
||||
i=0;
|
||||
do{
|
||||
int child;
|
||||
child=_tree[_node+i+1];
|
||||
if(child<=0)i+=1<<n-(-child>>8);
|
||||
else{
|
||||
size+=oc_huff_tree_size(_tree,child);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
while(i<nchildren);
|
||||
return size;
|
||||
}
|
||||
|
||||
/*Makes a copy of the given set of Huffman trees.
|
||||
_dst: The array to store the copy in.
|
||||
_src: The array of trees to copy.*/
|
||||
int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
|
||||
const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
|
||||
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
|
||||
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
|
||||
int total;
|
||||
int i;
|
||||
total=0;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||
size_t size;
|
||||
char *storage;
|
||||
size=oc_huff_tree_size(_src[i]);
|
||||
storage=(char *)_ogg_calloc(1,size);
|
||||
if(storage==NULL){
|
||||
size_t size;
|
||||
size=oc_huff_tree_size(_src[i],0);
|
||||
total+=size;
|
||||
_dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i]));
|
||||
if(_dst[i]==NULL){
|
||||
while(i-->0)_ogg_free(_dst[i]);
|
||||
return TH_EFAULT;
|
||||
}
|
||||
_dst[i]=oc_huff_tree_copy(_src[i],&storage);
|
||||
memcpy(_dst[i],_src[i],size*sizeof(*_dst[i]));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*Frees the memory used by a set of Huffman trees.
|
||||
_nodes: The array of trees to free.*/
|
||||
void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
|
||||
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
|
||||
int i;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
|
||||
}
|
||||
|
||||
|
||||
/*Unpacks a single token using the given Huffman tree.
|
||||
_opb: The buffer to unpack the token from.
|
||||
_node: The tree to unpack the token with.
|
||||
Return: The token value.*/
|
||||
int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){
|
||||
long bits;
|
||||
while(_node->nbits!=0){
|
||||
bits=oc_pack_look(_opb,_node->nbits);
|
||||
_node=_node->nodes[bits];
|
||||
oc_pack_adv(_opb,_node->depth);
|
||||
int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
|
||||
const unsigned char *ptr;
|
||||
const unsigned char *stop;
|
||||
oc_pb_window window;
|
||||
int available;
|
||||
long bits;
|
||||
int node;
|
||||
int n;
|
||||
ptr=_opb->ptr;
|
||||
window=_opb->window;
|
||||
stop=_opb->stop;
|
||||
available=_opb->bits;
|
||||
node=0;
|
||||
for(;;){
|
||||
n=_tree[node];
|
||||
if(n>available){
|
||||
unsigned shift;
|
||||
shift=OC_PB_WINDOW_SIZE-available;
|
||||
do{
|
||||
/*We don't bother setting eof because we won't check for it after we've
|
||||
started decoding DCT tokens.*/
|
||||
if(ptr>=stop){
|
||||
shift=(unsigned)-OC_LOTS_OF_BITS;
|
||||
break;
|
||||
}
|
||||
shift-=8;
|
||||
window|=(oc_pb_window)*ptr++<<shift;
|
||||
}
|
||||
while(shift>=8);
|
||||
/*Note: We never request more than 24 bits, so there's no need to fill in
|
||||
the last partial byte here.*/
|
||||
available=OC_PB_WINDOW_SIZE-shift;
|
||||
}
|
||||
bits=window>>OC_PB_WINDOW_SIZE-n;
|
||||
node=_tree[node+1+bits];
|
||||
if(node<=0)break;
|
||||
window<<=n;
|
||||
available-=n;
|
||||
}
|
||||
return _node->token;
|
||||
node=-node;
|
||||
n=node>>8;
|
||||
window<<=n;
|
||||
available-=n;
|
||||
_opb->ptr=ptr;
|
||||
_opb->window=window;
|
||||
_opb->bits=available;
|
||||
return node&255;
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: huffdec.h 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: huffdec.h 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -22,71 +22,11 @@
|
||||
|
||||
|
||||
|
||||
typedef struct oc_huff_node oc_huff_node;
|
||||
|
||||
/*A node in the Huffman tree.
|
||||
Instead of storing every branching in the tree, subtrees can be collapsed
|
||||
into one node, with a table of size 1<<nbits pointing directly to its
|
||||
descendants nbits levels down.
|
||||
This allows more than one bit to be read at a time, and avoids following all
|
||||
the intermediate branches with next to no increased code complexity once
|
||||
the collapsed tree has been built.
|
||||
We do _not_ require that a subtree be complete to be collapsed, but instead
|
||||
store duplicate pointers in the table, and record the actual depth of the
|
||||
node below its parent.
|
||||
This tells us the number of bits to advance the stream after reaching it.
|
||||
|
||||
This turns out to be equivalent to the method described in \cite{Hash95},
|
||||
without the requirement that codewords be sorted by length.
|
||||
If the codewords were sorted by length (so-called ``canonical-codes''), they
|
||||
could be decoded much faster via either Lindell and Moffat's approach or
|
||||
Hashemian's Condensed Huffman Code approach, the latter of which has an
|
||||
extremely small memory footprint.
|
||||
We can't use Choueka et al.'s finite state machine approach, which is
|
||||
extremely fast, because we can't allow multiple symbols to be output at a
|
||||
time; the codebook can and does change between symbols.
|
||||
It also has very large memory requirements, which impairs cache coherency.
|
||||
|
||||
@ARTICLE{Hash95,
|
||||
author="Reza Hashemian",
|
||||
title="Memory Efficient and High-Speed Search {Huffman} Coding",
|
||||
journal="{IEEE} Transactions on Communications",
|
||||
volume=43,
|
||||
number=10,
|
||||
pages="2576--2581",
|
||||
month=Oct,
|
||||
year=1995
|
||||
}*/
|
||||
struct oc_huff_node{
|
||||
/*The number of bits of the code needed to descend through this node.
|
||||
0 indicates a leaf node.
|
||||
Otherwise there are 1<<nbits nodes in the nodes table, which can be
|
||||
indexed by reading nbits bits from the stream.*/
|
||||
unsigned char nbits;
|
||||
/*The value of a token stored in a leaf node.
|
||||
The value in non-leaf nodes is undefined.*/
|
||||
unsigned char token;
|
||||
/*The depth of the current node, relative to its parent in the collapsed
|
||||
tree.
|
||||
This can be less than its parent's nbits value, in which case there are
|
||||
1<<nbits-depth copies of this node in the table, and the bitstream should
|
||||
only be advanced depth bits after reaching this node.*/
|
||||
unsigned char depth;
|
||||
/*The table of child nodes.
|
||||
The ACTUAL size of this array is 1<<nbits, despite what the declaration
|
||||
below claims.
|
||||
The exception is that for leaf nodes the size is 0.*/
|
||||
oc_huff_node *nodes[2];
|
||||
};
|
||||
|
||||
|
||||
|
||||
int oc_huff_trees_unpack(oc_pack_buf *_opb,
|
||||
oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
|
||||
int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
|
||||
const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
|
||||
void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
|
||||
int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node);
|
||||
|
||||
ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
|
||||
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
|
||||
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
|
||||
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
|
||||
int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);
|
||||
|
||||
#endif
|
||||
|
@ -1,19 +0,0 @@
|
||||
#if !defined(_huffenc_H)
|
||||
# define _huffenc_H (1)
|
||||
# include "huffman.h"
|
||||
|
||||
|
||||
|
||||
typedef th_huff_code th_huff_table[TH_NDCT_TOKENS];
|
||||
|
||||
|
||||
|
||||
extern const th_huff_code
|
||||
TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||
|
||||
|
||||
|
||||
int oc_huff_codes_pack(oggpack_buffer *_opb,
|
||||
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
|
||||
|
||||
#endif
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: idct.c 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -231,18 +231,18 @@ static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input coefficients.*/
|
||||
static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
const ogg_int16_t *in;
|
||||
ogg_int16_t *end;
|
||||
ogg_int16_t *out;
|
||||
ogg_int16_t w[64];
|
||||
static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
ogg_int16_t w[64];
|
||||
int i;
|
||||
/*Transform rows of x into columns of w.*/
|
||||
idct8_2(w,_x);
|
||||
idct8_1(w+1,_x+8);
|
||||
/*Transform rows of w into columns of y.*/
|
||||
for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
|
||||
for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
|
||||
/*Adjust for the scale factor.*/
|
||||
for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
|
||||
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||
/*Clear input data for next block (decoder only).*/
|
||||
if(_x!=_y)_x[0]=_x[1]=_x[8]=0;
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
@ -260,20 +260,20 @@ static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input coefficients.*/
|
||||
static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
const ogg_int16_t *in;
|
||||
ogg_int16_t *end;
|
||||
ogg_int16_t *out;
|
||||
ogg_int16_t w[64];
|
||||
static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
ogg_int16_t w[64];
|
||||
int i;
|
||||
/*Transform rows of x into columns of w.*/
|
||||
idct8_4(w,_x);
|
||||
idct8_3(w+1,_x+8);
|
||||
idct8_2(w+2,_x+16);
|
||||
idct8_1(w+3,_x+24);
|
||||
/*Transform rows of w into columns of y.*/
|
||||
for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
|
||||
for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
|
||||
/*Adjust for the scale factor.*/
|
||||
for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
|
||||
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||
/*Clear input data for next block (decoder only).*/
|
||||
if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
@ -282,28 +282,22 @@ static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input coefficients.*/
|
||||
static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
const ogg_int16_t *in;
|
||||
ogg_int16_t *end;
|
||||
ogg_int16_t *out;
|
||||
ogg_int16_t w[64];
|
||||
static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
ogg_int16_t w[64];
|
||||
int i;
|
||||
/*Transform rows of x into columns of w.*/
|
||||
for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
|
||||
for(i=0;i<8;i++)idct8(w+i,_x+i*8);
|
||||
/*Transform rows of w into columns of y.*/
|
||||
for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
|
||||
for(i=0;i<8;i++)idct8(_y+i,w+i*8);
|
||||
/*Adjust for the scale factor.*/
|
||||
for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
|
||||
}
|
||||
|
||||
void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
|
||||
int _last_zzi){
|
||||
(*_state->opt_vtable.idct8x8)(_y,_last_zzi);
|
||||
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||
if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
|
||||
void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
@ -329,7 +323,7 @@ void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
|
||||
gets.
|
||||
Needless to say we inherited this approach from VP3.*/
|
||||
/*Then perform the iDCT.*/
|
||||
if(_last_zzi<3)oc_idct8x8_3(_y,_y);
|
||||
else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
|
||||
else oc_idct8x8_slow(_y,_y);
|
||||
if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
|
||||
else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
|
||||
else oc_idct8x8_slow(_y,_x);
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: internal.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: internal.c 17506 2010-10-13 02:52:41Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -97,79 +97,29 @@ int oc_ilog(unsigned _v){
|
||||
|
||||
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with chroma decimated in the X and Y directions
|
||||
(4:2:0).
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
int dx;
|
||||
int dy;
|
||||
dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0];
|
||||
dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1];
|
||||
_cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2);
|
||||
_cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2);
|
||||
/*Allocates _sz bytes at an address that is a multiple of _align.
  The result must be released with oc_aligned_free(), because the returned
   pointer is offset from the start of the underlying allocation.
  _sz:    The number of usable bytes required.
  _align: The required alignment, in bytes.
          This must be a power of two no larger than UCHAR_MAX+1, since the
           padding count is stored in a single byte.
  Return: The aligned buffer, or NULL if _align is invalid, if _sz+_align
           would overflow a size_t, or if the underlying allocation fails.*/
void *oc_aligned_malloc(size_t _sz,size_t _align){
  unsigned char *p;
  if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
  p=(unsigned char *)_ogg_malloc(_sz+_align);
  if(p!=NULL){
    int offs;
    /*We skip offs+1 bytes, with 0<=offs<_align, chosen so that p+offs+1 is a
       multiple of _align: offs=(-p-1) mod _align, which is ~p&(_align-1).
      The previous expression, (p-1)&(_align-1), only produced an aligned
       result when p was already a multiple of _align/2.*/
    offs=(int)(~(size_t)(p-(unsigned char *)0)&(_align-1));
    /*Record the padding in the byte just before the returned address so that
       oc_aligned_free() can recover the original pointer.*/
    p[offs]=(unsigned char)offs;
    p+=offs+1;
  }
  return p;
}
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with chroma decimated in the Y direction.
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
int dx;
|
||||
int dy;
|
||||
dx=_lbmvs[0][0]+_lbmvs[2][0];
|
||||
dy=_lbmvs[0][1]+_lbmvs[2][1];
|
||||
_cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
||||
_cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
||||
dx=_lbmvs[1][0]+_lbmvs[3][0];
|
||||
dy=_lbmvs[1][1]+_lbmvs[3][1];
|
||||
_cbmvs[1][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
||||
_cbmvs[1][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
||||
/*Releases a buffer returned by oc_aligned_malloc().
  _ptr: The aligned pointer to release.
        Passing NULL is allowed and does nothing.*/
void oc_aligned_free(void *_ptr){
  unsigned char *aligned;
  if(_ptr==NULL)return;
  aligned=(unsigned char *)_ptr;
  /*The byte immediately before the aligned address holds the number of
     padding bytes that precede it; step back over that byte and the padding
     to reach the pointer that is handed to _ogg_free().*/
  _ogg_free(aligned-1-aligned[-1]);
}
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with chroma decimated in the X direction (4:2:2).
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
int dx;
|
||||
int dy;
|
||||
dx=_lbmvs[0][0]+_lbmvs[1][0];
|
||||
dy=_lbmvs[0][1]+_lbmvs[1][1];
|
||||
_cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
||||
_cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
||||
dx=_lbmvs[2][0]+_lbmvs[3][0];
|
||||
dy=_lbmvs[2][1]+_lbmvs[3][1];
|
||||
_cbmvs[2][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
||||
_cbmvs[2][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
||||
}
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with no chroma decimation (4:4:4).
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
||||
prediction.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0]));
|
||||
}
|
||||
|
||||
/*A table of functions used to fill in the chroma plane motion vectors for a
|
||||
macro block when 4 different motion vectors are specified in the luma
|
||||
plane.*/
|
||||
const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
|
||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
|
||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
|
||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
|
||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs11
|
||||
};
|
||||
|
||||
|
||||
|
||||
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
|
||||
size_t rowsz;
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: internal.h 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: internal.h 17578 2010-10-29 04:21:26Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_internal_H)
|
||||
@ -19,10 +19,20 @@
|
||||
# include <stdlib.h>
|
||||
# include <limits.h>
|
||||
# if defined(HAVE_CONFIG_H)
|
||||
# include <config.h>
|
||||
# include "config.h"
|
||||
# endif
|
||||
# include "theora/codec.h"
|
||||
# include "theora/theora.h"
|
||||
# include "ocintrin.h"
|
||||
|
||||
# if !defined(__GNUC_PREREQ)
|
||||
# if defined(__GNUC__)&&defined(__GNUC_MINOR__)
|
||||
# define __GNUC_PREREQ(_maj,_min) \
|
||||
((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
|
||||
# else
|
||||
# define __GNUC_PREREQ(_maj,_min) 0
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# if defined(_MSC_VER)
|
||||
/*Disable missing EMMS warnings.*/
|
||||
@ -31,24 +41,25 @@
|
||||
# pragma warning(disable:4554)
|
||||
# endif
|
||||
/*You, too, gcc.*/
|
||||
# if defined(__GNUC_PREREQ)
|
||||
# if __GNUC_PREREQ(4,2)
|
||||
# pragma GCC diagnostic ignored "-Wparentheses"
|
||||
# endif
|
||||
# if __GNUC_PREREQ(4,2)
|
||||
# pragma GCC diagnostic ignored "-Wparentheses"
|
||||
# endif
|
||||
|
||||
# include "ocintrin.h"
|
||||
# include "huffman.h"
|
||||
# include "quant.h"
|
||||
|
||||
/*Some assembly constructs require aligned operands.*/
|
||||
# if defined(OC_X86_ASM)
|
||||
/*Some assembly constructs require aligned operands.
|
||||
The following macros are _only_ intended for structure member declarations.
|
||||
Although they will sometimes work on stack variables, gcc will often silently
|
||||
ignore them.
|
||||
A separate set of macros could be made for manual stack alignment, but we
|
||||
don't actually require it anywhere.*/
|
||||
# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
|
||||
# if defined(__GNUC__)
|
||||
# define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
|
||||
# define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
|
||||
# elif defined(_MSC_VER)
|
||||
# define OC_ALIGN8(expr) __declspec (align(8)) expr
|
||||
# define OC_ALIGN16(expr) __declspec (align(16)) expr
|
||||
# else
|
||||
# error "Alignment macros required for this platform."
|
||||
# endif
|
||||
# endif
|
||||
# if !defined(OC_ALIGN8)
|
||||
@ -60,19 +71,8 @@
|
||||
|
||||
|
||||
|
||||
typedef struct oc_sb_flags oc_sb_flags;
|
||||
typedef struct oc_border_info oc_border_info;
|
||||
typedef struct oc_fragment oc_fragment;
|
||||
typedef struct oc_fragment_plane oc_fragment_plane;
|
||||
typedef struct oc_base_opt_vtable oc_base_opt_vtable;
|
||||
typedef struct oc_base_opt_data oc_base_opt_data;
|
||||
typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
|
||||
typedef struct oc_theora_state oc_theora_state;
|
||||
|
||||
|
||||
|
||||
/*This library's version.*/
|
||||
# define OC_VENDOR_STRING "Xiph.Org libtheora 1.1 20090822 (Thusnelda)"
|
||||
# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)"
|
||||
|
||||
/*Theora bitstream version.*/
|
||||
# define TH_VERSION_MAJOR (3)
|
||||
@ -83,315 +83,6 @@ typedef struct oc_theora_state oc_theora_state;
|
||||
((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
|
||||
(_info)->version_subminor>=(_sub)))
|
||||
|
||||
/*A keyframe.*/
|
||||
#define OC_INTRA_FRAME (0)
|
||||
/*A predicted frame.*/
|
||||
#define OC_INTER_FRAME (1)
|
||||
/*A frame of unknown type (frame type decision has not yet been made).*/
|
||||
#define OC_UNKWN_FRAME (-1)
|
||||
|
||||
/*The amount of padding to add to the reconstructed frame buffers on all
|
||||
sides.
|
||||
This is used to allow unrestricted motion vectors without special casing.
|
||||
This must be a multiple of 2.*/
|
||||
#define OC_UMV_PADDING (16)
|
||||
|
||||
/*Frame classification indices.*/
|
||||
/*The previous golden frame.*/
|
||||
#define OC_FRAME_GOLD (0)
|
||||
/*The previous frame.*/
|
||||
#define OC_FRAME_PREV (1)
|
||||
/*The current frame.*/
|
||||
#define OC_FRAME_SELF (2)
|
||||
|
||||
/*The input or output buffer.*/
|
||||
#define OC_FRAME_IO (3)
|
||||
|
||||
/*Macroblock modes.*/
|
||||
/*Macro block is invalid: It is never coded.*/
|
||||
#define OC_MODE_INVALID (-1)
|
||||
/*Encoded difference from the same macro block in the previous frame.*/
|
||||
#define OC_MODE_INTER_NOMV (0)
|
||||
/*Encoded with no motion compensated prediction.*/
|
||||
#define OC_MODE_INTRA (1)
|
||||
/*Encoded difference from the previous frame offset by the given motion
|
||||
vector.*/
|
||||
#define OC_MODE_INTER_MV (2)
|
||||
/*Encoded difference from the previous frame offset by the last coded motion
|
||||
vector.*/
|
||||
#define OC_MODE_INTER_MV_LAST (3)
|
||||
/*Encoded difference from the previous frame offset by the second to last
|
||||
coded motion vector.*/
|
||||
#define OC_MODE_INTER_MV_LAST2 (4)
|
||||
/*Encoded difference from the same macro block in the previous golden
|
||||
frame.*/
|
||||
#define OC_MODE_GOLDEN_NOMV (5)
|
||||
/*Encoded difference from the previous golden frame offset by the given motion
|
||||
vector.*/
|
||||
#define OC_MODE_GOLDEN_MV (6)
|
||||
/*Encoded difference from the previous frame offset by the individual motion
|
||||
vectors given for each block.*/
|
||||
#define OC_MODE_INTER_MV_FOUR (7)
|
||||
/*The number of (coded) modes.*/
|
||||
#define OC_NMODES (8)
|
||||
|
||||
/*Determines the reference frame used for a given MB mode.*/
|
||||
#define OC_FRAME_FOR_MODE(_x) \
|
||||
OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
|
||||
OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
|
||||
|
||||
/*Constants for the packet state machine common between encoder and decoder.*/
|
||||
|
||||
/*Next packet to emit/read: Codec info header.*/
|
||||
#define OC_PACKET_INFO_HDR (-3)
|
||||
/*Next packet to emit/read: Comment header.*/
|
||||
#define OC_PACKET_COMMENT_HDR (-2)
|
||||
/*Next packet to emit/read: Codec setup header.*/
|
||||
#define OC_PACKET_SETUP_HDR (-1)
|
||||
/*No more packets to emit/read.*/
|
||||
#define OC_PACKET_DONE (INT_MAX)
|
||||
|
||||
|
||||
|
||||
/*Super blocks are 32x32 segments of pixels in a single color plane indexed
|
||||
in image order.
|
||||
Internally, super blocks are broken up into four quadrants, each of which
|
||||
contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
|
||||
Quadrants, and the blocks within them, are indexed in a special order called
|
||||
a "Hilbert curve" within the super block.
|
||||
|
||||
In order to differentiate between the Hilbert-curve indexing strategy and
|
||||
the regular image order indexing strategy, blocks indexed in image order
|
||||
are called "fragments".
|
||||
Fragments are indexed in image order, left to right, then bottom to top,
|
||||
from Y' plane to Cb plane to Cr plane.
|
||||
|
||||
The co-located fragments in all image planes corresponding to the location
|
||||
of a single quadrant of a luma plane super block form a macro block.
|
||||
Thus there is only a single set of macro blocks for all planes, each of which
|
||||
contains between 6 and 12 fragments, depending on the pixel format.
|
||||
Therefore macro block information is kept in a separate set of arrays from
|
||||
super blocks to avoid unused space in the other planes.
|
||||
The lists are indexed in super block order.
|
||||
That is, the macro block corresponding to the macro block mbi in (luma plane)
|
||||
super block sbi is at index (sbi<<2|mbi).
|
||||
Thus the number of macro blocks in each dimension is always twice the number
|
||||
of super blocks, even when only an odd number fall inside the coded frame.
|
||||
These "extra" macro blocks are just an artifact of our internal data layout,
|
||||
and not part of the coded stream; they are flagged with a negative MB mode.*/
|
||||
|
||||
|
||||
|
||||
/*A single quadrant of the map from a super block to fragment numbers.*/
|
||||
typedef ptrdiff_t oc_sb_map_quad[4];
|
||||
/*A map from a super block to fragment numbers.*/
|
||||
typedef oc_sb_map_quad oc_sb_map[4];
|
||||
/*A single plane of the map from a macro block to fragment numbers.*/
|
||||
typedef ptrdiff_t oc_mb_map_plane[4];
|
||||
/*A map from a macro block to fragment numbers.*/
|
||||
typedef oc_mb_map_plane oc_mb_map[3];
|
||||
/*A motion vector.*/
|
||||
typedef signed char oc_mv[2];
|
||||
|
||||
|
||||
|
||||
/*Super block information.*/
|
||||
struct oc_sb_flags{
|
||||
unsigned char coded_fully:1;
|
||||
unsigned char coded_partially:1;
|
||||
unsigned char quad_valid:4;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Information about a fragment which intersects the border of the displayable
|
||||
region.
|
||||
This marks which pixels belong to the displayable region.*/
|
||||
struct oc_border_info{
|
||||
/*A bit mask marking which pixels are in the displayable region.
|
||||
Pixel (x,y) corresponds to bit (y<<3|x).*/
|
||||
ogg_int64_t mask;
|
||||
/*The number of pixels in the displayable region.
|
||||
This is always positive, and always less than 64.*/
|
||||
int npixels;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Fragment information.*/
|
||||
struct oc_fragment{
|
||||
/*A flag indicating whether or not this fragment is coded.*/
|
||||
unsigned coded:1;
|
||||
/*A flag indicating that this entire fragment lies outside the displayable
|
||||
region of the frame.
|
||||
Note the contrast with an invalid macro block, which is outside the coded
|
||||
frame, not just the displayable one.
|
||||
There are no fragments outside the coded frame by construction.*/
|
||||
unsigned invalid:1;
|
||||
/*The index of the quality index used for this fragment's AC coefficients.*/
|
||||
unsigned qii:6;
|
||||
/*The mode of the macroblock this fragment belongs to.*/
|
||||
unsigned mb_mode:3;
|
||||
/*The index of the associated border information for fragments which lie
|
||||
partially outside the displayable region.
|
||||
For fragments completely inside or outside this region, this is -1.
|
||||
Note that the C standard requires an explicit signed keyword for bitfield
|
||||
types, since some compilers may treat them as unsigned without it.*/
|
||||
signed int borderi:5;
|
||||
/*The prediction-corrected DC component.
|
||||
Note that the C standard requires an explicit signed keyword for bitfield
|
||||
types, since some compilers may treat them as unsigned without it.*/
|
||||
signed int dc:16;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*A description of each fragment plane.*/
|
||||
struct oc_fragment_plane{
|
||||
/*The number of fragments in the horizontal direction.*/
|
||||
int nhfrags;
|
||||
/*The number of fragments in the vertical direction.*/
|
||||
int nvfrags;
|
||||
/*The offset of the first fragment in the plane.*/
|
||||
ptrdiff_t froffset;
|
||||
/*The total number of fragments in the plane.*/
|
||||
ptrdiff_t nfrags;
|
||||
/*The number of super blocks in the horizontal direction.*/
|
||||
unsigned nhsbs;
|
||||
/*The number of super blocks in the vertical direction.*/
|
||||
unsigned nvsbs;
|
||||
/*The offset of the first super block in the plane.*/
|
||||
unsigned sboffset;
|
||||
/*The total number of super blocks in the plane.*/
|
||||
unsigned nsbs;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*The shared (encoder and decoder) functions that have accelerated variants.*/
|
||||
struct oc_base_opt_vtable{
|
||||
void (*frag_copy)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
|
||||
void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void (*state_frag_copy_list)(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli);
|
||||
void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,
|
||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void (*restore_fpu)(void);
|
||||
};
|
||||
|
||||
/*The shared (encoder and decoder) tables that vary according to which variants
|
||||
of the above functions are used.*/
|
||||
struct oc_base_opt_data{
|
||||
const unsigned char *dct_fzig_zag;
|
||||
};
|
||||
|
||||
|
||||
/*State information common to both the encoder and decoder.*/
|
||||
struct oc_theora_state{
|
||||
/*The stream information.*/
|
||||
th_info info;
|
||||
/*Table for shared accelerated functions.*/
|
||||
oc_base_opt_vtable opt_vtable;
|
||||
/*Table for shared data used by accelerated functions.*/
|
||||
oc_base_opt_data opt_data;
|
||||
/*CPU flags to detect the presence of extended instruction sets.*/
|
||||
ogg_uint32_t cpu_flags;
|
||||
/*The fragment plane descriptions.*/
|
||||
oc_fragment_plane fplanes[3];
|
||||
/*The list of fragments, indexed in image order.*/
|
||||
oc_fragment *frags;
|
||||
/*The offset into the reference frame buffer to the upper-left pixel of
|
||||
each fragment.*/
|
||||
ptrdiff_t *frag_buf_offs;
|
||||
/*The motion vector for each fragment.*/
|
||||
oc_mv *frag_mvs;
|
||||
/*The total number of fragments in a single frame.*/
|
||||
ptrdiff_t nfrags;
|
||||
/*The list of super block maps, indexed in image order.*/
|
||||
oc_sb_map *sb_maps;
|
||||
/*The list of super block flags, indexed in image order.*/
|
||||
oc_sb_flags *sb_flags;
|
||||
/*The total number of super blocks in a single frame.*/
|
||||
unsigned nsbs;
|
||||
/*The fragments from each color plane that belong to each macro block.
|
||||
Fragments are stored in image order (left to right then top to bottom).
|
||||
When chroma components are decimated, the extra fragments have an index of
|
||||
-1.*/
|
||||
oc_mb_map *mb_maps;
|
||||
/*The list of macro block modes.
|
||||
A negative number indicates the macro block lies entirely outside the
|
||||
coded frame.*/
|
||||
signed char *mb_modes;
|
||||
/*The number of macro blocks in the X direction.*/
|
||||
unsigned nhmbs;
|
||||
/*The number of macro blocks in the Y direction.*/
|
||||
unsigned nvmbs;
|
||||
/*The total number of macro blocks.*/
|
||||
size_t nmbs;
|
||||
/*The list of coded fragments, in coded order.
|
||||
Uncoded fragments are stored in reverse order from the end of the list.*/
|
||||
ptrdiff_t *coded_fragis;
|
||||
/*The number of coded fragments in each plane.*/
|
||||
ptrdiff_t ncoded_fragis[3];
|
||||
/*The total number of coded fragments.*/
|
||||
ptrdiff_t ntotal_coded_fragis;
|
||||
/*The index of the buffers being used for each OC_FRAME_* reference frame.*/
|
||||
int ref_frame_idx[4];
|
||||
/*The actual buffers used for the previously decoded frames.*/
|
||||
th_ycbcr_buffer ref_frame_bufs[4];
|
||||
/*The storage for the reference frame buffers.*/
|
||||
unsigned char *ref_frame_data[4];
|
||||
/*The strides for each plane in the reference frames.*/
|
||||
int ref_ystride[3];
|
||||
/*The number of unique border patterns.*/
|
||||
int nborders;
|
||||
/*The unique border patterns for all border fragments.
|
||||
The borderi field of fragments which straddle the border indexes this
|
||||
list.*/
|
||||
oc_border_info borders[16];
|
||||
/*The frame number of the last keyframe.*/
|
||||
ogg_int64_t keyframe_num;
|
||||
/*The frame number of the current frame.*/
|
||||
ogg_int64_t curframe_num;
|
||||
/*The granpos of the current frame.*/
|
||||
ogg_int64_t granpos;
|
||||
/*The type of the current frame.*/
|
||||
unsigned char frame_type;
|
||||
/*The bias to add to the frame count when computing granule positions.*/
|
||||
unsigned char granpos_bias;
|
||||
/*The number of quality indices used in the current frame.*/
|
||||
unsigned char nqis;
|
||||
/*The quality indices of the current frame.*/
|
||||
unsigned char qis[3];
|
||||
/*The dequantization tables, stored in zig-zag order, and indexed by
|
||||
qi, pli, qti, and zzi.*/
|
||||
ogg_uint16_t *dequant_tables[64][3][2];
|
||||
OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]);
|
||||
/*Loop filter strength parameters.*/
|
||||
unsigned char loop_filter_limits[64];
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*The function type used to fill in the chroma plane motion vectors for a
|
||||
macro block when 4 different motion vectors are specified in the luma
|
||||
plane.
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
||||
prediction.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
|
||||
|
||||
|
||||
|
||||
/*A map from the index in the zig zag scan to the coefficient number in a
|
||||
@ -409,14 +100,12 @@ extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12];
|
||||
/*The number of indices in the oc_mb_map array that can be valid for each of
|
||||
the various chroma decimation types.*/
|
||||
extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS];
|
||||
/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
|
||||
macro block when 4 different motion vectors are specified in the luma
|
||||
plane.*/
|
||||
extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
|
||||
|
||||
|
||||
|
||||
int oc_ilog(unsigned _v);
|
||||
void *oc_aligned_malloc(size_t _sz,size_t _align);
|
||||
void oc_aligned_free(void *_ptr);
|
||||
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz);
|
||||
void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz);
|
||||
void oc_free_2d(void *_ptr);
|
||||
@ -424,86 +113,4 @@ void oc_free_2d(void *_ptr);
|
||||
void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
|
||||
const th_ycbcr_buffer _src);
|
||||
|
||||
int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
|
||||
void oc_state_clear(oc_theora_state *_state);
|
||||
void oc_state_vtable_init_c(oc_theora_state *_state);
|
||||
void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
|
||||
int _y0,int _yend);
|
||||
void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
|
||||
void oc_state_borders_fill(oc_theora_state *_state,int _refi);
|
||||
void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
|
||||
th_ycbcr_buffer _img);
|
||||
int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
|
||||
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||
int _pli,int _dx,int _dy);
|
||||
|
||||
int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
|
||||
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
|
||||
#if defined(OC_DUMP_IMAGES)
|
||||
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
|
||||
const char *_suf);
|
||||
#endif
|
||||
|
||||
/*Shared accelerated functions.*/
|
||||
void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_frag_recon_intra(const oc_theora_state *_state,
|
||||
unsigned char *_dst,int _dst_ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter2(const oc_theora_state *_state,
|
||||
unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
|
||||
int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],int _last_zzi);
|
||||
void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_state_frag_copy_list(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli);
|
||||
void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,
|
||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu(const oc_theora_state *_state);
|
||||
|
||||
/*Default pure-C implementations.*/
|
||||
void oc_frag_copy_c(unsigned char *_dst,
|
||||
const unsigned char *_src,int _src_ystride);
|
||||
void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter_c(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
|
||||
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_state_frag_copy_list_c(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli);
|
||||
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
|
||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu_c(void);
|
||||
|
||||
/*We need a way to call a few encoder functions without introducing a link-time
|
||||
dependency into the decoder, while still allowing the old alpha API which
|
||||
does not distinguish between encoder and decoder objects to be used.
|
||||
We do this by placing a function table at the start of the encoder object
|
||||
which can dispatch into the encoder library.
|
||||
We do a similar thing for the decoder in case we ever decide to split off a
|
||||
common base library.*/
|
||||
typedef void (*oc_state_clear_func)(theora_state *_th);
|
||||
typedef int (*oc_state_control_func)(theora_state *th,int _req,
|
||||
void *_buf,size_t _buf_sz);
|
||||
typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
|
||||
ogg_int64_t _granulepos);
|
||||
typedef double (*oc_state_granule_time_func)(theora_state *_th,
|
||||
ogg_int64_t _granulepos);
|
||||
|
||||
|
||||
struct oc_state_dispatch_vtable{
|
||||
oc_state_clear_func clear;
|
||||
oc_state_control_func control;
|
||||
oc_state_granule_frame_func granule_frame;
|
||||
oc_state_granule_time_func granule_time;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -2,29 +2,27 @@
|
||||
# define _mathops_H (1)
|
||||
# include <ogg/ogg.h>
|
||||
|
||||
# ifdef __GNUC_PREREQ
|
||||
# if __GNUC_PREREQ(3,4)
|
||||
# include <limits.h>
|
||||
# if __GNUC_PREREQ(3,4)
|
||||
# include <limits.h>
|
||||
/*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
|
||||
"upgrading" the type of an entire expression to an (unsigned) size_t.*/
|
||||
# if INT_MAX>=2147483647
|
||||
# define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
||||
# define OC_CLZ32(_x) (__builtin_clz(_x))
|
||||
# elif LONG_MAX>=2147483647L
|
||||
# define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
||||
# define OC_CLZ32(_x) (__builtin_clzl(_x))
|
||||
# endif
|
||||
# if INT_MAX>=9223372036854775807LL
|
||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
||||
# define OC_CLZ64(_x) (__builtin_clz(_x))
|
||||
# elif LONG_MAX>=9223372036854775807LL
|
||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
||||
# define OC_CLZ64(_x) (__builtin_clzl(_x))
|
||||
# elif LLONG_MAX>=9223372036854775807LL|| \
|
||||
__LONG_LONG_MAX__>=9223372036854775807LL
|
||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
|
||||
# define OC_CLZ64(_x) (__builtin_clzll(_x))
|
||||
# endif
|
||||
# if INT_MAX>=2147483647
|
||||
# define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
||||
# define OC_CLZ32(_x) (__builtin_clz(_x))
|
||||
# elif LONG_MAX>=2147483647L
|
||||
# define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
||||
# define OC_CLZ32(_x) (__builtin_clzl(_x))
|
||||
# endif
|
||||
# if INT_MAX>=9223372036854775807LL
|
||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
||||
# define OC_CLZ64(_x) (__builtin_clz(_x))
|
||||
# elif LONG_MAX>=9223372036854775807LL
|
||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
||||
# define OC_CLZ64(_x) (__builtin_clzl(_x))
|
||||
# elif LLONG_MAX>=9223372036854775807LL|| \
|
||||
__LONG_LONG_MAX__>=9223372036854775807LL
|
||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
|
||||
# define OC_CLZ64(_x) (__builtin_clzll(_x))
|
||||
# endif
|
||||
# endif
|
||||
|
||||
@ -134,8 +132,12 @@ int oc_ilog64(ogg_int64_t _v);
|
||||
# define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v)))
|
||||
|
||||
#define OC_Q57(_v) ((ogg_int64_t)(_v)<<57)
|
||||
#define OC_Q10(_v) ((_v)<<10)
|
||||
|
||||
ogg_int64_t oc_bexp64(ogg_int64_t _z);
|
||||
ogg_int64_t oc_blog64(ogg_int64_t _w);
|
||||
|
||||
ogg_uint32_t oc_bexp32_q10(int _z);
|
||||
int oc_blog32_q10(ogg_uint32_t _w);
|
||||
|
||||
#endif
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: quant.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: quant.c 17307 2010-06-27 06:02:15Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -21,6 +21,14 @@
|
||||
#include "quant.h"
|
||||
#include "decint.h"
|
||||
|
||||
/*The maximum output of the DCT with +/- 255 inputs is +/- 8157.
|
||||
These minimum quantizers ensure the result after quantization (and after
|
||||
prediction for DC) will be no more than +/- 510.
|
||||
The tokenization system can handle values up to +/- 580, so there is no need
|
||||
to do any coefficient clamping.
|
||||
I would rather have allowed smaller quantizers and had to clamp, but these
|
||||
minimums were required when constructing the original VP3 matrices and have
|
||||
been formalized in the spec.*/
|
||||
static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
|
||||
static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
|
||||
|
||||
|
@ -11,25 +11,92 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: state.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: state.c 17576 2010-10-29 01:07:51Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "internal.h"
|
||||
#if defined(OC_X86_ASM)
|
||||
#if defined(_MSC_VER)
|
||||
# include "x86_vc/x86int.h"
|
||||
#else
|
||||
# include "x86/x86int.h"
|
||||
#endif
|
||||
#endif
|
||||
#include "state.h"
|
||||
#if defined(OC_DUMP_IMAGES)
|
||||
# include <stdio.h>
|
||||
# include "png.h"
|
||||
#endif
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with chroma decimated in the X and Y directions
|
||||
(4:2:0).
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
int dx;
|
||||
int dy;
|
||||
dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1])
|
||||
+OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
|
||||
dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1])
|
||||
+OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
|
||||
_cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,2,2),OC_DIV_ROUND_POW2(dy,2,2));
|
||||
}
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with chroma decimated in the Y direction.
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
int dx;
|
||||
int dy;
|
||||
dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[2]);
|
||||
dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[2]);
|
||||
_cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||
dx=OC_MV_X(_lbmvs[1])+OC_MV_X(_lbmvs[3]);
|
||||
dy=OC_MV_Y(_lbmvs[1])+OC_MV_Y(_lbmvs[3]);
|
||||
_cbmvs[1]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||
}
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with chroma decimated in the X direction (4:2:2).
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
int dx;
|
||||
int dy;
|
||||
dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]);
|
||||
dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]);
|
||||
_cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||
dx=OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
|
||||
dy=OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
|
||||
_cbmvs[2]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||
}
|
||||
|
||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||
block when 4 different motion vectors are specified in the luma plane.
|
||||
This version is for use with no chroma decimation (4:4:4).
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
||||
prediction.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||
_cbmvs[0]=_lbmvs[0];
|
||||
_cbmvs[1]=_lbmvs[1];
|
||||
_cbmvs[2]=_lbmvs[2];
|
||||
_cbmvs[3]=_lbmvs[3];
|
||||
}
|
||||
|
||||
/*A table of functions used to fill in the chroma plane motion vectors for a
   macro block when 4 different motion vectors are specified in the luma
   plane.
  It holds TH_PF_NFORMATS entries, one per helper defined above, in the order
   00, 01, 10, 11.
  NOTE(review): presumably indexed by the stream's pixel format value; confirm
   against the callers.*/
|
||||
const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
|
||||
/*Chroma decimated in both X and Y (4:2:0): one averaged vector.*/
(oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
|
||||
/*Chroma decimated in Y only: two vectors, each from a pair of luma blocks.*/
(oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
|
||||
/*Chroma decimated in X only (4:2:2): two vectors, each from a pair of luma
   blocks.*/
(oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
|
||||
/*No chroma decimation (4:4:4): the four luma vectors are copied as-is.*/
(oc_set_chroma_mvs_func)oc_set_chroma_mvs11
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Returns the fragment index of the top-left block in a macro block.
|
||||
This can be used to test whether or not the whole macro block is valid.
|
||||
_sb_map: The super block map.
|
||||
@ -469,7 +536,7 @@ static void oc_state_frarray_clear(oc_theora_state *_state){
|
||||
unrestricted motion vectors without special casing the boundary.
|
||||
If chroma is decimated in either direction, the padding is reduced by a
|
||||
factor of 2 on the appropriate sides.
|
||||
_nrefs: The number of reference buffers to init; must be 3 or 4.*/
|
||||
_nrefs: The number of reference buffers to init; must be in the range 3...6.*/
|
||||
static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
||||
th_info *info;
|
||||
unsigned char *ref_frame_data;
|
||||
@ -481,6 +548,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
||||
int yheight;
|
||||
int chstride;
|
||||
int cheight;
|
||||
ptrdiff_t align;
|
||||
ptrdiff_t yoffset;
|
||||
ptrdiff_t coffset;
|
||||
ptrdiff_t *frag_buf_offs;
|
||||
@ -489,28 +557,33 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
||||
int vdec;
|
||||
int rfi;
|
||||
int pli;
|
||||
if(_nrefs<3||_nrefs>4)return TH_EINVAL;
|
||||
if(_nrefs<3||_nrefs>6)return TH_EINVAL;
|
||||
info=&_state->info;
|
||||
/*Compute the image buffer parameters for each plane.*/
|
||||
hdec=!(info->pixel_fmt&1);
|
||||
vdec=!(info->pixel_fmt&2);
|
||||
yhstride=info->frame_width+2*OC_UMV_PADDING;
|
||||
yheight=info->frame_height+2*OC_UMV_PADDING;
|
||||
chstride=yhstride>>hdec;
|
||||
/*Require 16-byte aligned rows in the chroma planes.*/
|
||||
chstride=(yhstride>>hdec)+15&~15;
|
||||
cheight=yheight>>vdec;
|
||||
yplane_sz=yhstride*(size_t)yheight;
|
||||
cplane_sz=chstride*(size_t)cheight;
|
||||
yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
|
||||
coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
|
||||
ref_frame_sz=yplane_sz+2*cplane_sz;
|
||||
/*Although we guarantee the rows of the chroma planes are a multiple of 16
|
||||
bytes, the initial padding on the first row may only be 8 bytes.
|
||||
Compute the offset needed to align the actual image data to a multiple of 16.*/
|
||||
align=-coffset&15;
|
||||
ref_frame_sz=yplane_sz+2*cplane_sz+16;
|
||||
ref_frame_data_sz=_nrefs*ref_frame_sz;
|
||||
/*Check for overflow.
|
||||
The same caveats apply as for oc_state_frarray_init().*/
|
||||
if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
|
||||
if(yplane_sz/yhstride!=(size_t)yheight||2*cplane_sz+16<cplane_sz||
|
||||
ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
|
||||
return TH_EIMPL;
|
||||
}
|
||||
ref_frame_data=_ogg_malloc(ref_frame_data_sz);
|
||||
ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16);
|
||||
frag_buf_offs=_state->frag_buf_offs=
|
||||
_ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
|
||||
if(ref_frame_data==NULL||frag_buf_offs==NULL){
|
||||
@ -532,15 +605,15 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
||||
memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
|
||||
sizeof(_state->ref_frame_bufs[0]));
|
||||
}
|
||||
_state->ref_frame_handle=ref_frame_data;
|
||||
/*Set up the data pointers for the image buffers.*/
|
||||
for(rfi=0;rfi<_nrefs;rfi++){
|
||||
_state->ref_frame_data[rfi]=ref_frame_data;
|
||||
_state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
|
||||
ref_frame_data+=yplane_sz;
|
||||
ref_frame_data+=yplane_sz+align;
|
||||
_state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
|
||||
ref_frame_data+=cplane_sz;
|
||||
_state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
|
||||
ref_frame_data+=cplane_sz;
|
||||
ref_frame_data+=cplane_sz+(16-align);
|
||||
/*Flip the buffer upside down.
|
||||
This allows us to decode Theora's bottom-up frames in their natural
|
||||
order, yet return a top-down buffer with a positive stride to the user.*/
|
||||
@ -550,7 +623,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
||||
_state->ref_ystride[0]=-yhstride;
|
||||
_state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
|
||||
/*Initialize the fragment buffer offsets.*/
|
||||
ref_frame_data=_state->ref_frame_data[0];
|
||||
ref_frame_data=_state->ref_frame_bufs[0][0].data;
|
||||
fragi=0;
|
||||
for(pli=0;pli<3;pli++){
|
||||
th_img_plane *iplane;
|
||||
@ -576,41 +649,44 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
||||
vpix+=stride<<3;
|
||||
}
|
||||
}
|
||||
/*Initialize the reference frame indices.*/
|
||||
/*Initialize the reference frame pointers and indices.*/
|
||||
_state->ref_frame_idx[OC_FRAME_GOLD]=
|
||||
_state->ref_frame_idx[OC_FRAME_PREV]=
|
||||
_state->ref_frame_idx[OC_FRAME_SELF]=-1;
|
||||
_state->ref_frame_idx[OC_FRAME_IO]=_nrefs>3?3:-1;
|
||||
_state->ref_frame_idx[OC_FRAME_GOLD_ORIG]=
|
||||
_state->ref_frame_idx[OC_FRAME_PREV_ORIG]=
|
||||
_state->ref_frame_idx[OC_FRAME_SELF]=
|
||||
_state->ref_frame_idx[OC_FRAME_IO]=-1;
|
||||
_state->ref_frame_data[OC_FRAME_GOLD]=
|
||||
_state->ref_frame_data[OC_FRAME_PREV]=
|
||||
_state->ref_frame_data[OC_FRAME_GOLD_ORIG]=
|
||||
_state->ref_frame_data[OC_FRAME_PREV_ORIG]=
|
||||
_state->ref_frame_data[OC_FRAME_SELF]=
|
||||
_state->ref_frame_data[OC_FRAME_IO]=NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void oc_state_ref_bufs_clear(oc_theora_state *_state){
|
||||
_ogg_free(_state->frag_buf_offs);
|
||||
_ogg_free(_state->ref_frame_data[0]);
|
||||
oc_aligned_free(_state->ref_frame_handle);
|
||||
}
|
||||
|
||||
|
||||
void oc_state_vtable_init_c(oc_theora_state *_state){
|
||||
void oc_state_accel_init_c(oc_theora_state *_state){
|
||||
_state->cpu_flags=0;
|
||||
#if defined(OC_STATE_USE_VTABLE)
|
||||
_state->opt_vtable.frag_copy=oc_frag_copy_c;
|
||||
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_c;
|
||||
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
|
||||
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
|
||||
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
|
||||
_state->opt_vtable.idct8x8=oc_idct8x8_c;
|
||||
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
|
||||
_state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
|
||||
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_c;
|
||||
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||
oc_state_loop_filter_frag_rows_c;
|
||||
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
|
||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
|
||||
}
|
||||
|
||||
/*Initialize the accelerated function pointers.*/
|
||||
void oc_state_vtable_init(oc_theora_state *_state){
|
||||
#if defined(OC_X86_ASM)
|
||||
oc_state_vtable_init_x86(_state);
|
||||
#else
|
||||
oc_state_vtable_init_c(_state);
|
||||
#endif
|
||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
|
||||
}
|
||||
|
||||
|
||||
@ -648,7 +724,7 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
|
||||
system.*/
|
||||
_state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
|
||||
_state->frame_type=OC_UNKWN_FRAME;
|
||||
oc_state_vtable_init(_state);
|
||||
oc_state_accel_init(_state);
|
||||
ret=oc_state_frarray_init(_state);
|
||||
if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs);
|
||||
if(ret<0){
|
||||
@ -758,11 +834,10 @@ void oc_state_borders_fill(oc_theora_state *_state,int _refi){
|
||||
_offsets[1] is set if the motion vector has non-zero fractional
|
||||
components.
|
||||
_pli: The color plane index.
|
||||
_dx: The X component of the motion vector.
|
||||
_dy: The Y component of the motion vector.
|
||||
_mv: The motion vector.
|
||||
Return: The number of offsets returned: 1 or 2.*/
|
||||
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||
int _pli,int _dx,int _dy){
|
||||
int _pli,oc_mv _mv){
|
||||
/*Here is a brief description of how Theora handles motion vectors:
|
||||
Motion vector components are specified to half-pixel accuracy in
|
||||
undecimated directions of each plane, and quarter-pixel accuracy in
|
||||
@ -785,21 +860,25 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||
int xfrac;
|
||||
int yfrac;
|
||||
int offs;
|
||||
int dx;
|
||||
int dy;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
/*These two variables decide whether we are in half- or quarter-pixel
|
||||
precision in each component.*/
|
||||
xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1));
|
||||
yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2));
|
||||
dx=OC_MV_X(_mv);
|
||||
dy=OC_MV_Y(_mv);
|
||||
/*These two variables are either 0 if all the fractional bits are zero or -1
|
||||
if any of them are non-zero.*/
|
||||
xfrac=OC_SIGNMASK(-(_dx&(xprec|1)));
|
||||
yfrac=OC_SIGNMASK(-(_dy&(yprec|1)));
|
||||
offs=(_dx>>xprec)+(_dy>>yprec)*ystride;
|
||||
xfrac=OC_SIGNMASK(-(dx&(xprec|1)));
|
||||
yfrac=OC_SIGNMASK(-(dy&(yprec|1)));
|
||||
offs=(dx>>xprec)+(dy>>yprec)*ystride;
|
||||
if(xfrac||yfrac){
|
||||
int xmask;
|
||||
int ymask;
|
||||
xmask=OC_SIGNMASK(_dx);
|
||||
ymask=OC_SIGNMASK(_dy);
|
||||
xmask=OC_SIGNMASK(dx);
|
||||
ymask=OC_SIGNMASK(dy);
|
||||
yfrac&=ystride;
|
||||
_offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask);
|
||||
_offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask);
|
||||
@ -848,13 +927,17 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||
int mx2;
|
||||
int my2;
|
||||
int offs;
|
||||
int dx;
|
||||
int dy;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
qpy=_pli!=0&&!(_state->info.pixel_fmt&2);
|
||||
my=OC_MVMAP[qpy][_dy+31];
|
||||
my2=OC_MVMAP2[qpy][_dy+31];
|
||||
dx=OC_MV_X(_mv);
|
||||
dy=OC_MV_Y(_mv);
|
||||
my=OC_MVMAP[qpy][dy+31];
|
||||
my2=OC_MVMAP2[qpy][dy+31];
|
||||
qpx=_pli!=0&&!(_state->info.pixel_fmt&1);
|
||||
mx=OC_MVMAP[qpx][_dx+31];
|
||||
mx2=OC_MVMAP2[qpx][_dx+31];
|
||||
mx=OC_MVMAP[qpx][dx+31];
|
||||
mx2=OC_MVMAP2[qpx][dx+31];
|
||||
offs=my*ystride+mx;
|
||||
if(mx2||my2){
|
||||
_offsets[1]=offs+my2*ystride+mx2;
|
||||
@ -866,18 +949,12 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||
#endif
|
||||
}
|
||||
|
||||
void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||
_state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs,
|
||||
_last_zzi,_dc_quant);
|
||||
}
|
||||
|
||||
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||
unsigned char *dst;
|
||||
ptrdiff_t frag_buf_off;
|
||||
int ystride;
|
||||
int mb_mode;
|
||||
int refi;
|
||||
/*Apply the inverse transform.*/
|
||||
/*Special case only having a DC component.*/
|
||||
if(_last_zzi<2){
|
||||
@ -887,69 +964,35 @@ void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
no iDCT rounding.*/
|
||||
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
|
||||
/*LOOP VECTORIZES.*/
|
||||
for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
|
||||
for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p;
|
||||
}
|
||||
else{
|
||||
/*First, dequantize the DC coefficient.*/
|
||||
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
||||
oc_idct8x8(_state,_dct_coeffs,_last_zzi);
|
||||
oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
|
||||
}
|
||||
/*Fill in the target buffer.*/
|
||||
frag_buf_off=_state->frag_buf_offs[_fragi];
|
||||
mb_mode=_state->frags[_fragi].mb_mode;
|
||||
refi=_state->frags[_fragi].refi;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
|
||||
if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
|
||||
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
|
||||
if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
|
||||
else{
|
||||
const unsigned char *ref;
|
||||
int mvoffsets[2];
|
||||
ref=
|
||||
_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
|
||||
+frag_buf_off;
|
||||
ref=_state->ref_frame_data[refi]+frag_buf_off;
|
||||
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
||||
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
|
||||
_state->frag_mvs[_fragi])>1){
|
||||
oc_frag_recon_inter2(_state,
|
||||
dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
|
||||
dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64);
|
||||
}
|
||||
else{
|
||||
oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
|
||||
}
|
||||
else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
|
||||
}
|
||||
}
|
||||
|
||||
/*Copies the fragments specified by the lists of fragment indices from one
|
||||
frame to another.
|
||||
_fragis: A pointer to a list of fragment indices.
|
||||
_nfragis: The number of fragment indices to copy.
|
||||
_dst_frame: The reference frame to copy to.
|
||||
_src_frame: The reference frame to copy from.
|
||||
_pli: The color plane the fragments lie in.*/
|
||||
void oc_state_frag_copy_list(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli){
|
||||
_state->opt_vtable.state_frag_copy_list(_state,_fragis,_nfragis,_dst_frame,
|
||||
_src_frame,_pli);
|
||||
}
|
||||
|
||||
void oc_state_frag_copy_list_c(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli){
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
const unsigned char *src_frame_data;
|
||||
unsigned char *dst_frame_data;
|
||||
ptrdiff_t fragii;
|
||||
int ystride;
|
||||
dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
|
||||
src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
for(fragii=0;fragii<_nfragis;fragii++){
|
||||
ptrdiff_t frag_buf_off;
|
||||
frag_buf_off=frag_buf_offs[_fragis[fragii]];
|
||||
oc_frag_copy(_state,dst_frame_data+frag_buf_off,
|
||||
src_frame_data+frag_buf_off,ystride);
|
||||
}
|
||||
}
|
||||
|
||||
static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
|
||||
static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){
|
||||
int y;
|
||||
_pix-=2;
|
||||
for(y=0;y<8;y++){
|
||||
@ -965,7 +1008,7 @@ static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
|
||||
}
|
||||
}
|
||||
|
||||
static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
|
||||
static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){
|
||||
int x;
|
||||
_pix-=_ystride*2;
|
||||
for(x=0;x<8;x++){
|
||||
@ -982,20 +1025,16 @@ static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
|
||||
|
||||
/*Initialize the bounding values array used by the loop filter.
|
||||
_bv: Storage for the array.
|
||||
Return: 0 on success, or a non-zero value if no filtering need be applied.*/
|
||||
int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
|
||||
int flimit;
|
||||
_flimit: The filter limit as defined in Section 7.10 of the spec.*/
|
||||
void oc_loop_filter_init_c(signed char _bv[256],int _flimit){
|
||||
int i;
|
||||
flimit=_state->loop_filter_limits[_state->qis[0]];
|
||||
if(flimit==0)return 1;
|
||||
memset(_bv,0,sizeof(_bv[0])*256);
|
||||
for(i=0;i<flimit;i++){
|
||||
if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit;
|
||||
_bv[127-i]=-i;
|
||||
_bv[127+i]=i;
|
||||
if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i;
|
||||
for(i=0;i<_flimit;i++){
|
||||
if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit);
|
||||
_bv[127-i]=(signed char)(-i);
|
||||
_bv[127+i]=(signed char)(i);
|
||||
if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||
@ -1006,14 +1045,8 @@ int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
|
||||
_pli: The color plane to filter.
|
||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||
void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int _bv[256],
|
||||
int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
_state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
|
||||
_fragy0,_fragy_end);
|
||||
}
|
||||
|
||||
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
|
||||
int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
|
||||
signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
const oc_fragment_plane *fplane;
|
||||
const oc_fragment *frags;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
@ -1030,7 +1063,7 @@ void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
|
||||
fragi_top=fplane->froffset;
|
||||
fragi_bot=fragi_top+fplane->nfrags;
|
||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frags=_state->frags;
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
|
552
media/libtheora/lib/state.h
Normal file
552
media/libtheora/lib/state.h
Normal file
@ -0,0 +1,552 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: internal.h 17337 2010-07-19 16:08:54Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_state_H)
|
||||
# define _state_H (1)
|
||||
# include "internal.h"
|
||||
# include "huffman.h"
|
||||
# include "quant.h"
|
||||
|
||||
|
||||
|
||||
/*A single quadrant of the map from a super block to fragment numbers.*/
|
||||
typedef ptrdiff_t oc_sb_map_quad[4];
|
||||
/*A map from a super block to fragment numbers.*/
|
||||
typedef oc_sb_map_quad oc_sb_map[4];
|
||||
/*A single plane of the map from a macro block to fragment numbers.*/
|
||||
typedef ptrdiff_t oc_mb_map_plane[4];
|
||||
/*A map from a macro block to fragment numbers.*/
|
||||
typedef oc_mb_map_plane oc_mb_map[3];
|
||||
/*A motion vector.*/
|
||||
typedef ogg_int16_t oc_mv;
|
||||
|
||||
typedef struct oc_sb_flags oc_sb_flags;
|
||||
typedef struct oc_border_info oc_border_info;
|
||||
typedef struct oc_fragment oc_fragment;
|
||||
typedef struct oc_fragment_plane oc_fragment_plane;
|
||||
typedef struct oc_base_opt_vtable oc_base_opt_vtable;
|
||||
typedef struct oc_base_opt_data oc_base_opt_data;
|
||||
typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
|
||||
typedef struct oc_theora_state oc_theora_state;
|
||||
|
||||
|
||||
|
||||
/*Shared accelerated functions.*/
|
||||
# if defined(OC_X86_ASM)
|
||||
# if defined(_MSC_VER)
|
||||
# include "x86_vc/x86int.h"
|
||||
# else
|
||||
# include "x86/x86int.h"
|
||||
# endif
|
||||
# endif
|
||||
# if defined(OC_ARM_ASM)
|
||||
# include "arm/armint.h"
|
||||
# endif
|
||||
# if defined(OC_C64X_ASM)
|
||||
# include "c64x/c64xint.h"
|
||||
# endif
|
||||
|
||||
# if !defined(oc_state_accel_init)
|
||||
# define oc_state_accel_init oc_state_accel_init_c
|
||||
# endif
|
||||
# if defined(OC_STATE_USE_VTABLE)
|
||||
# if !defined(oc_frag_copy)
|
||||
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||
((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride))
|
||||
# endif
|
||||
# if !defined(oc_frag_copy_list)
|
||||
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs) \
|
||||
((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs))
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_intra)
|
||||
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
|
||||
((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue))
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter)
|
||||
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||
((*(_state)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter2)
|
||||
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||
((*(_state)->opt_vtable.frag_recon_inter2)(_dst, \
|
||||
_src1,_src2,_ystride,_residue))
|
||||
# endif
|
||||
# if !defined(oc_idct8x8)
|
||||
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
|
||||
((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi))
|
||||
# endif
|
||||
# if !defined(oc_state_frag_recon)
|
||||
# define oc_state_frag_recon(_state,_fragi, \
|
||||
_pli,_dct_coeffs,_last_zzi,_dc_quant) \
|
||||
((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \
|
||||
_pli,_dct_coeffs,_last_zzi,_dc_quant))
|
||||
# endif
|
||||
# if !defined(oc_loop_filter_init)
|
||||
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||
((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit))
|
||||
# endif
|
||||
# if !defined(oc_state_loop_filter_frag_rows)
|
||||
# define oc_state_loop_filter_frag_rows(_state, \
|
||||
_bv,_refi,_pli,_fragy0,_fragy_end) \
|
||||
((*(_state)->opt_vtable.state_loop_filter_frag_rows)(_state, \
|
||||
_bv,_refi,_pli,_fragy0,_fragy_end))
|
||||
# endif
|
||||
# if !defined(oc_restore_fpu)
|
||||
# define oc_restore_fpu(_state) \
|
||||
((*(_state)->opt_vtable.restore_fpu)())
|
||||
# endif
|
||||
# else
|
||||
# if !defined(oc_frag_copy)
|
||||
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||
oc_frag_copy_c(_dst,_src,_ystride)
|
||||
# endif
|
||||
# if !defined(oc_frag_copy_list)
|
||||
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs) \
|
||||
oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs)
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_intra)
|
||||
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
|
||||
oc_frag_recon_intra_c(_dst,_dst_ystride,_residue)
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter)
|
||||
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||
oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter2)
|
||||
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||
oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue)
|
||||
# endif
|
||||
# if !defined(oc_idct8x8)
|
||||
# define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi)
|
||||
# endif
|
||||
# if !defined(oc_state_frag_recon)
|
||||
# define oc_state_frag_recon oc_state_frag_recon_c
|
||||
# endif
|
||||
# if !defined(oc_loop_filter_init)
|
||||
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||
oc_loop_filter_init_c(_bv,_flimit)
|
||||
# endif
|
||||
# if !defined(oc_state_loop_filter_frag_rows)
|
||||
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c
|
||||
# endif
|
||||
# if !defined(oc_restore_fpu)
|
||||
# define oc_restore_fpu(_state) do{}while(0)
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
|
||||
/*A keyframe.*/
|
||||
# define OC_INTRA_FRAME (0)
|
||||
/*A predicted frame.*/
|
||||
# define OC_INTER_FRAME (1)
|
||||
/*A frame of unknown type (frame type decision has not yet been made).*/
|
||||
# define OC_UNKWN_FRAME (-1)
|
||||
|
||||
/*The amount of padding to add to the reconstructed frame buffers on all
|
||||
sides.
|
||||
This is used to allow unrestricted motion vectors without special casing.
|
||||
This must be a multiple of 2.*/
|
||||
# define OC_UMV_PADDING (16)
|
||||
|
||||
/*Frame classification indices.*/
|
||||
/*The previous golden frame.*/
|
||||
# define OC_FRAME_GOLD (0)
|
||||
/*The previous frame.*/
|
||||
# define OC_FRAME_PREV (1)
|
||||
/*The current frame.*/
|
||||
# define OC_FRAME_SELF (2)
|
||||
/*Used to mark uncoded fragments (for DC prediction).*/
|
||||
# define OC_FRAME_NONE (3)
|
||||
|
||||
/*The input or output buffer.*/
|
||||
# define OC_FRAME_IO (3)
|
||||
/*Uncompressed prev golden frame.*/
|
||||
# define OC_FRAME_GOLD_ORIG (4)
|
||||
/*Uncompressed previous frame. */
|
||||
# define OC_FRAME_PREV_ORIG (5)
|
||||
|
||||
/*Macroblock modes.*/
|
||||
/*Macro block is invalid: It is never coded.*/
|
||||
# define OC_MODE_INVALID (-1)
|
||||
/*Encoded difference from the same macro block in the previous frame.*/
|
||||
# define OC_MODE_INTER_NOMV (0)
|
||||
/*Encoded with no motion compensated prediction.*/
|
||||
# define OC_MODE_INTRA (1)
|
||||
/*Encoded difference from the previous frame offset by the given motion
|
||||
vector.*/
|
||||
# define OC_MODE_INTER_MV (2)
|
||||
/*Encoded difference from the previous frame offset by the last coded motion
|
||||
vector.*/
|
||||
# define OC_MODE_INTER_MV_LAST (3)
|
||||
/*Encoded difference from the previous frame offset by the second to last
|
||||
coded motion vector.*/
|
||||
# define OC_MODE_INTER_MV_LAST2 (4)
|
||||
/*Encoded difference from the same macro block in the previous golden
|
||||
frame.*/
|
||||
# define OC_MODE_GOLDEN_NOMV (5)
|
||||
/*Encoded difference from the previous golden frame offset by the given motion
|
||||
vector.*/
|
||||
# define OC_MODE_GOLDEN_MV (6)
|
||||
/*Encoded difference from the previous frame offset by the individual motion
|
||||
vectors given for each block.*/
|
||||
# define OC_MODE_INTER_MV_FOUR (7)
|
||||
/*The number of (coded) modes.*/
|
||||
# define OC_NMODES (8)
|
||||
|
||||
/*Determines the reference frame used for a given MB mode.*/
|
||||
# define OC_FRAME_FOR_MODE(_x) \
|
||||
OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
|
||||
OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
|
||||
|
||||
/*Constants for the packet state machine common between encoder and decoder.*/
|
||||
|
||||
/*Next packet to emit/read: Codec info header.*/
|
||||
# define OC_PACKET_INFO_HDR (-3)
|
||||
/*Next packet to emit/read: Comment header.*/
|
||||
# define OC_PACKET_COMMENT_HDR (-2)
|
||||
/*Next packet to emit/read: Codec setup header.*/
|
||||
# define OC_PACKET_SETUP_HDR (-1)
|
||||
/*No more packets to emit/read.*/
|
||||
# define OC_PACKET_DONE (INT_MAX)
|
||||
|
||||
|
||||
|
||||
/*Motion vectors are packed into a single oc_mv value: the X component lives
   in the low 8 bits and the Y component in the bits above them.
  NOTE(review): oc_mv is declared elsewhere; presumably it is a signed 16-bit
   type, so both components must fit in a signed 8-bit value -- confirm.*/

/*Packs the components (_x,_y) into an oc_mv.
  The Y component is scaled with a multiplication rather than a left shift:
   left-shifting a negative value is undefined behavior in C, while the
   multiplication yields the same bit pattern and is well defined.
  The sub-expressions are fully parenthesized so the result does not depend on
   the relative precedence of & and |.*/
#define OC_MV(_x,_y) ((oc_mv)(((_x)&0xFF)|((_y)*256)))
/*Extracts the X component of a packed motion vector, sign-extended from its
   low 8 bits.*/
#define OC_MV_X(_mv) ((signed char)(_mv))
/*Extracts the Y component of a packed motion vector.
  For negative vectors this relies on >> performing an arithmetic shift, which
   is implementation-defined in C.*/
#define OC_MV_Y(_mv) ((_mv)>>8)
/*Adds two packed motion vectors component by component.*/
#define OC_MV_ADD(_mv1,_mv2) \
  OC_MV(OC_MV_X(_mv1)+OC_MV_X(_mv2), \
   OC_MV_Y(_mv1)+OC_MV_Y(_mv2))
/*Subtracts _mv2 from _mv1 component by component.*/
#define OC_MV_SUB(_mv1,_mv2) \
  OC_MV(OC_MV_X(_mv1)-OC_MV_X(_mv2), \
   OC_MV_Y(_mv1)-OC_MV_Y(_mv2))
|
||||
|
||||
|
||||
|
||||
/*Super blocks are 32x32 segments of pixels in a single color plane indexed
|
||||
in image order.
|
||||
Internally, super blocks are broken up into four quadrants, each of which
|
||||
contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
|
||||
Quadrants, and the blocks within them, are indexed in a special order called
|
||||
a "Hilbert curve" within the super block.
|
||||
|
||||
In order to differentiate between the Hilbert-curve indexing strategy and
|
||||
the regular image order indexing strategy, blocks indexed in image order
|
||||
are called "fragments".
|
||||
Fragments are indexed in image order, left to right, then bottom to top,
|
||||
from Y' plane to Cb plane to Cr plane.
|
||||
|
||||
The co-located fragments in all image planes corresponding to the location
|
||||
of a single quadrant of a luma plane super block form a macro block.
|
||||
Thus there is only a single set of macro blocks for all planes, each of which
|
||||
contains between 6 and 12 fragments, depending on the pixel format.
|
||||
Therefore macro block information is kept in a separate set of arrays from
|
||||
super blocks to avoid unused space in the other planes.
|
||||
The lists are indexed in super block order.
|
||||
That is, the macro block corresponding to the macro block mbi in (luma plane)
|
||||
super block sbi is at index (sbi<<2|mbi).
|
||||
Thus the number of macro blocks in each dimension is always twice the number
|
||||
of super blocks, even when only an odd number fall inside the coded frame.
|
||||
These "extra" macro blocks are just an artifact of our internal data layout,
|
||||
and not part of the coded stream; they are flagged with a negative MB mode.*/
|
||||
|
||||
|
||||
|
||||
/*Super block information.
  All members are bitfields of unsigned char, using 6 bits in total.*/
|
||||
struct oc_sb_flags{
|
||||
/*NOTE(review): presumably set when every block in the super block is coded --
   confirm against the code that fills in these flags.*/
unsigned char coded_fully:1;
|
||||
/*NOTE(review): presumably set when only some of the blocks in the super block
   are coded -- confirm.*/
unsigned char coded_partially:1;
|
||||
/*Four bits; presumably one validity bit for each of the four quadrants of the
   super block -- TODO confirm the bit order.*/
unsigned char quad_valid:4;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Information about a fragment which intersects the border of the displayable
|
||||
region.
|
||||
This marks which pixels belong to the displayable region.*/
|
||||
struct oc_border_info{
|
||||
/*A bit mask marking which pixels are in the displayable region.
|
||||
Pixel (x,y) corresponds to bit (y<<3|x), i.e., there is one bit for each
   pixel of the 8x8 fragment, stored one row after another.*/
|
||||
ogg_int64_t mask;
|
||||
/*The number of pixels in the displayable region.
|
||||
This is always positive, and always less than 64: fragments that lie
   completely inside or outside the region have no border info (their
   borderi field in oc_fragment is -1).*/
|
||||
int npixels;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Fragment information.
  Every member is a bitfield; the widths add up to 32 bits (1+1+4+2+3+5+16).
  NOTE(review): presumably this is meant to pack one fragment into a single
   32-bit word, but bitfield layout is implementation-defined -- confirm before
   relying on it (e.g., from assembly).*/
|
||||
struct oc_fragment{
|
||||
/*A flag indicating whether or not this fragment is coded.*/
|
||||
unsigned coded:1;
|
||||
/*A flag indicating that this entire fragment lies outside the displayable
|
||||
region of the frame.
|
||||
Note the contrast with an invalid macro block, which is outside the coded
|
||||
frame, not just the displayable one.
|
||||
There are no fragments outside the coded frame by construction.*/
|
||||
unsigned invalid:1;
|
||||
/*The index of the quality index used for this fragment's AC coefficients.*/
|
||||
unsigned qii:4;
|
||||
/*The index of the reference frame this fragment is predicted from.*/
|
||||
unsigned refi:2;
|
||||
/*The mode of the macroblock this fragment belongs to.
  As an unsigned 3-bit field this holds the coded modes 0...7 (OC_NMODES of
   them); it cannot represent OC_MODE_INVALID (-1).*/
|
||||
unsigned mb_mode:3;
|
||||
/*The index of the associated border information for fragments which lie
|
||||
partially outside the displayable region.
|
||||
For fragments completely inside or outside this region, this is -1.
|
||||
Note that the C standard requires an explicit signed keyword for bitfield
|
||||
types, since some compilers may treat them as unsigned without it.
  A signed 5-bit field covers -1 and every index into the 16-entry borders
   array of oc_theora_state.*/
|
||||
signed int borderi:5;
|
||||
/*The prediction-corrected DC component.
|
||||
Note that the C standard requires an explicit signed keyword for bitfield
|
||||
types, since some compilers may treat them as unsigned without it.*/
|
||||
signed int dc:16;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*A description of each fragment plane.
  oc_theora_state holds three of these (fplanes[3]), one per color plane.
  NOTE(review): presumably froffset and sboffset index the frame-wide fragment
   and super block lists of oc_theora_state -- confirm against oc_state_init.*/
|
||||
struct oc_fragment_plane{
|
||||
/*The number of fragments in the horizontal direction.*/
|
||||
int nhfrags;
|
||||
/*The number of fragments in the vertical direction.*/
|
||||
int nvfrags;
|
||||
/*The offset of the first fragment in the plane.*/
|
||||
ptrdiff_t froffset;
|
||||
/*The total number of fragments in the plane.*/
|
||||
ptrdiff_t nfrags;
|
||||
/*The number of super blocks in the horizontal direction.*/
|
||||
unsigned nhsbs;
|
||||
/*The number of super blocks in the vertical direction.*/
|
||||
unsigned nvsbs;
|
||||
/*The offset of the first super block in the plane.*/
|
||||
unsigned sboffset;
|
||||
/*The total number of super blocks in the plane.*/
|
||||
unsigned nsbs;
|
||||
};
|
||||
|
||||
|
||||
/*The type of the state_loop_filter_frag_rows entry of oc_base_opt_vtable;
   oc_state_loop_filter_frag_rows_c is declared with this signature.
  NOTE(review): judging by the parameter names, an implementation filters the
   fragment rows from _fragy0 up to _fragy_end of plane _pli in reference frame
   _refi, using the 256-entry table _bv prepared by loop_filter_init -- confirm
   the exact range semantics against the C implementation.*/
typedef void (*oc_state_loop_filter_frag_rows_func)(
|
||||
const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli,
|
||||
int _fragy0,int _fragy_end);
|
||||
|
||||
/*The shared (encoder and decoder) functions that have accelerated variants.
  Every entry has a pure-C default declared later in this file with the same
   parameter types, named oc_<entry>_c (oc_frag_copy_c for frag_copy,
   oc_restore_fpu_c for restore_fpu, and so on).
  oc_theora_state only embeds this table when OC_STATE_USE_VTABLE is
   defined.*/
|
||||
struct oc_base_opt_vtable{
|
||||
void (*frag_copy)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void (*frag_copy_list)(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||
/*Note that the input _x is not const-qualified.*/
void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void (*loop_filter_init)(signed char _bv[256],int _flimit);
|
||||
oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows;
|
||||
void (*restore_fpu)(void);
|
||||
};
|
||||
|
||||
/*The shared (encoder and decoder) tables that vary according to which variants
|
||||
of the above functions are used.*/
|
||||
struct oc_base_opt_data{
|
||||
/*NOTE(review): presumably the forward zig-zag index table in the coefficient
   order expected by the selected idct8x8 variant -- confirm against the
   accel_init routines.*/
const unsigned char *dct_fzig_zag;
|
||||
};
|
||||
|
||||
|
||||
/*State information common to both the encoder and decoder.
  Note that opt_vtable is only compiled in when OC_STATE_USE_VTABLE is
   defined, so the offset of every later member depends on that macro.*/
|
||||
struct oc_theora_state{
|
||||
/*The stream information.*/
|
||||
th_info info;
|
||||
# if defined(OC_STATE_USE_VTABLE)
|
||||
/*Table for shared accelerated functions.*/
|
||||
oc_base_opt_vtable opt_vtable;
|
||||
# endif
|
||||
/*Table for shared data used by accelerated functions.*/
|
||||
oc_base_opt_data opt_data;
|
||||
/*CPU flags to detect the presence of extended instruction sets.*/
|
||||
ogg_uint32_t cpu_flags;
|
||||
/*The fragment plane descriptions.*/
|
||||
oc_fragment_plane fplanes[3];
|
||||
/*The list of fragments, indexed in image order.*/
|
||||
oc_fragment *frags;
|
||||
/*The offset into the reference frame buffer to the upper-left pixel of
|
||||
each fragment.*/
|
||||
ptrdiff_t *frag_buf_offs;
|
||||
/*The motion vector for each fragment.*/
|
||||
oc_mv *frag_mvs;
|
||||
/*The total number of fragments in a single frame.*/
|
||||
ptrdiff_t nfrags;
|
||||
/*The list of super block maps, indexed in image order.*/
|
||||
oc_sb_map *sb_maps;
|
||||
/*The list of super block flags, indexed in image order.*/
|
||||
oc_sb_flags *sb_flags;
|
||||
/*The total number of super blocks in a single frame.*/
|
||||
unsigned nsbs;
|
||||
/*The fragments from each color plane that belong to each macro block.
|
||||
Fragments are stored in image order (left to right then top to bottom).
|
||||
When chroma components are decimated, the extra fragments have an index of
|
||||
-1.*/
|
||||
oc_mb_map *mb_maps;
|
||||
/*The list of macro block modes.
|
||||
A negative number indicates the macro block lies entirely outside the
|
||||
coded frame.*/
|
||||
signed char *mb_modes;
|
||||
/*The number of macro blocks in the X direction.*/
|
||||
unsigned nhmbs;
|
||||
/*The number of macro blocks in the Y direction.*/
|
||||
unsigned nvmbs;
|
||||
/*The total number of macro blocks.*/
|
||||
size_t nmbs;
|
||||
/*The list of coded fragments, in coded order.
|
||||
Uncoded fragments are stored in reverse order from the end of the list.*/
|
||||
ptrdiff_t *coded_fragis;
|
||||
/*The number of coded fragments in each plane.*/
|
||||
ptrdiff_t ncoded_fragis[3];
|
||||
/*The total number of coded fragments.*/
|
||||
ptrdiff_t ntotal_coded_fragis;
|
||||
/*The actual buffers used for the reference frames.*/
|
||||
th_ycbcr_buffer ref_frame_bufs[6];
|
||||
/*The index of the buffers being used for each OC_FRAME_* reference frame.*/
|
||||
int ref_frame_idx[6];
|
||||
/*The storage for the reference frame buffers.
|
||||
This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here
|
||||
for faster look-up.*/
|
||||
unsigned char *ref_frame_data[6];
|
||||
/*The handle used to allocate the reference frame buffers.*/
|
||||
unsigned char *ref_frame_handle;
|
||||
/*The strides for each plane in the reference frames.*/
|
||||
int ref_ystride[3];
|
||||
/*The number of unique border patterns.*/
|
||||
int nborders;
|
||||
/*The unique border patterns for all border fragments.
|
||||
The borderi field of fragments which straddle the border indexes this
|
||||
list.*/
|
||||
oc_border_info borders[16];
|
||||
/*The frame number of the last keyframe.*/
|
||||
ogg_int64_t keyframe_num;
|
||||
/*The frame number of the current frame.*/
|
||||
ogg_int64_t curframe_num;
|
||||
/*The granpos of the current frame.*/
|
||||
ogg_int64_t granpos;
|
||||
/*The type of the current frame.*/
|
||||
signed char frame_type;
|
||||
/*The bias to add to the frame count when computing granule positions.*/
|
||||
unsigned char granpos_bias;
|
||||
/*The number of quality indices used in the current frame.*/
|
||||
unsigned char nqis;
|
||||
/*The quality indices of the current frame.*/
|
||||
unsigned char qis[3];
|
||||
/*The dequantization tables, stored in zig-zag order, and indexed by
|
||||
qi, pli, qti, and zzi.*/
|
||||
ogg_uint16_t *dequant_tables[64][3][2];
|
||||
/*NOTE(review): presumably the 16-byte aligned backing storage that the
   dequant_tables pointers above refer to (it has the same [64][3][2] shape)
   -- confirm against the code that initializes dequant_tables.*/
OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]);
|
||||
/*Loop filter strength parameters.*/
|
||||
unsigned char loop_filter_limits[64];
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*The function type used to fill in the chroma plane motion vectors for a
|
||||
macro block when 4 different motion vectors are specified in the luma
|
||||
plane.
|
||||
_cbmvs: The chroma block-level motion vectors to fill in (output).
|
||||
_lbmvs: The luma block-level motion vectors (input; const-qualified, so they
         are not modified).*/
|
||||
typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
|
||||
|
||||
|
||||
|
||||
/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
|
||||
macro block when 4 different motion vectors are specified in the luma
|
||||
plane.
  The table has TH_PF_NFORMATS entries; presumably it is indexed by the
   stream's pixel format -- TODO confirm at the call sites.*/
|
||||
extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
|
||||
|
||||
|
||||
|
||||
/*Declarations of the routines that operate on the shared oc_theora_state.
  Only the signatures are visible here; see the definitions for the exact
   contracts.*/
int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
|
||||
void oc_state_clear(oc_theora_state *_state);
|
||||
void oc_state_accel_init_c(oc_theora_state *_state);
|
||||
void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
|
||||
int _y0,int _yend);
|
||||
void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
|
||||
void oc_state_borders_fill(oc_theora_state *_state,int _refi);
|
||||
void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
|
||||
th_ycbcr_buffer _img);
|
||||
int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
|
||||
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||
int _pli,oc_mv _mv);
|
||||
|
||||
/*Has the same signature as the loop_filter_init entry of oc_base_opt_vtable;
   presumably its pure-C default.*/
void oc_loop_filter_init_c(signed char _bv[256],int _flimit);
|
||||
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
|
||||
/*The frame dumping routine is only declared when OC_DUMP_IMAGES is defined.*/
# if defined(OC_DUMP_IMAGES)
|
||||
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
|
||||
const char *_suf);
|
||||
# endif
|
||||
|
||||
/*Default pure-C implementations of shared accelerated functions.
  Each prototype has the same parameter types as the like-named entry of
   oc_base_opt_vtable (oc_frag_copy_c for frag_copy, and so on).
  Only some parameter names differ: _src_ystride and _dst_ystride here versus
   _ystride in the table.*/
|
||||
void oc_frag_copy_c(unsigned char *_dst,
|
||||
const unsigned char *_src,int _src_ystride);
|
||||
void oc_frag_copy_list_c(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter_c(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu_c(void);
|
||||
|
||||
/*We need a way to call a few encoder functions without introducing a link-time
|
||||
dependency into the decoder, while still allowing the old alpha API which
|
||||
does not distinguish between encoder and decoder objects to be used.
|
||||
We do this by placing a function table at the start of the encoder object
|
||||
which can dispatch into the encoder library.
|
||||
We do a similar thing for the decoder in case we ever decide to split off a
|
||||
common base library.
  The four typedefs below are the types of the entries of that table
   (oc_state_dispatch_vtable).*/
|
||||
/*Type of the clear entry.*/
typedef void (*oc_state_clear_func)(theora_state *_th);
|
||||
/*Type of the control entry.
  _buf_sz is presumably the size of _buf in bytes -- confirm.*/
typedef int (*oc_state_control_func)(theora_state *th,int _req,
|
||||
void *_buf,size_t _buf_sz);
|
||||
/*Type of the granule_frame entry.*/
typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
|
||||
ogg_int64_t _granulepos);
|
||||
/*Type of the granule_time entry.*/
typedef double (*oc_state_granule_time_func)(theora_state *_th,
|
||||
ogg_int64_t _granulepos);
|
||||
|
||||
|
||||
/*The function table used to dispatch legacy-API calls into the encoder or
   decoder without a link-time dependency.
  It has one entry for each of the clear, control, granule_frame and
   granule_time operations.*/
struct oc_state_dispatch_vtable{
|
||||
oc_state_clear_func clear;
|
||||
oc_state_control_func control;
|
||||
oc_state_granule_frame_func granule_frame;
|
||||
oc_state_granule_time_func granule_time;
|
||||
};
|
||||
|
||||
#endif
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -22,10 +22,64 @@
|
||||
The iteration each instruction belongs to is marked in the comments as #i.*/
|
||||
#include <stddef.h>
|
||||
#include "x86int.h"
|
||||
#include "mmxfrag.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||
between rows.
  The block is moved as two groups of four 8-byte rows through mm0...mm3; the
   src and dst pointers are advanced by four rows between the groups.
  Each macro argument is evaluated exactly once, and _ystride is converted to
   ptrdiff_t before being handed to the asm.
  The asm statement lists only "memory" as a clobber: mm0...mm3 are overwritten
   without being declared.*/
|
||||
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
||||
do{ \
|
||||
const unsigned char *src; \
|
||||
unsigned char *dst; \
|
||||
ptrdiff_t ystride3; \
|
||||
src=(_src); \
|
||||
dst=(_dst); \
|
||||
__asm__ __volatile__( \
|
||||
/*Row 0: src+0*ystride*/ \
|
||||
"movq (%[src]),%%mm0\n\t" \
|
||||
/*Row 1: src+1*ystride*/ \
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||
/*ystride3=ystride*3*/ \
|
||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
|
||||
/*Row 2: src+2*ystride*/ \
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||
/*Row 3: src+3*ystride*/ \
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||
/*Row 0: dst+0*ystride*/ \
|
||||
"movq %%mm0,(%[dst])\n\t" \
|
||||
/*Row 1: dst+1*ystride*/ \
|
||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||
/*Advance src to the next 4 rows.*/ \
|
||||
"lea (%[src],%[ystride],4),%[src]\n\t" \
|
||||
/*Row 2: dst+2*ystride*/ \
|
||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||
/*Row 3: dst+3*ystride*/ \
|
||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||
/*Advance dst to the next 4 rows.*/ \
|
||||
"lea (%[dst],%[ystride],4),%[dst]\n\t" \
|
||||
/*Row 4: (advanced) src+0*ystride*/ \
|
||||
"movq (%[src]),%%mm0\n\t" \
|
||||
/*Row 5: (advanced) src+1*ystride*/ \
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||
/*Row 6: (advanced) src+2*ystride*/ \
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||
/*Row 7: (advanced) src+3*ystride*/ \
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||
/*Row 4: (advanced) dst+0*ystride*/ \
|
||||
"movq %%mm0,(%[dst])\n\t" \
|
||||
/*Row 5: (advanced) dst+1*ystride*/ \
|
||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||
/*Row 6: (advanced) dst+2*ystride*/ \
|
||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||
/*Row 7: (advanced) dst+3*ystride*/ \
|
||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||
:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
|
||||
:[ystride]"r"((ptrdiff_t)(_ystride)) \
|
||||
:"memory" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||
between rows.*/
|
||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
@ -33,6 +87,27 @@ void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
|
||||
}
|
||||
|
||||
/*Copies a list of 8x8 fragments from one reference frame to the same
   positions in another, using OC_FRAG_COPY_MMX for each fragment.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride shared by both reference frames.
  _fragis:        The list of indices of the fragments to copy.
  _nfragis:       The number of entries of _fragis to process; nothing is
                   copied when this is zero or negative.
  _frag_buf_offs: For each fragment index, the offset of that fragment within
                   a reference frame.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  const ptrdiff_t *next_fragi;
  ptrdiff_t        nleft;
  next_fragi=_fragis;
  /*Walk the index list front to back, counting down the entries left.*/
  for(nleft=_nfragis;nleft>0;nleft--){
    ptrdiff_t offs;
    /*The same offset locates the fragment in both frames.*/
    offs=_frag_buf_offs[*next_fragi++];
    OC_FRAG_COPY_MMX(_dst_frame+offs,_src_frame+offs,_ystride);
  }
}
|
||||
|
||||
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue){
|
||||
__asm__ __volatile__(
|
||||
|
@ -1,64 +0,0 @@
|
||||
#if !defined(_x86_mmxfrag_H)
|
||||
# define _x86_mmxfrag_H (1)
|
||||
# include <stddef.h>
|
||||
# include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||
between rows.*/
|
||||
#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
||||
do{ \
|
||||
const unsigned char *src; \
|
||||
unsigned char *dst; \
|
||||
ptrdiff_t ystride3; \
|
||||
src=(_src); \
|
||||
dst=(_dst); \
|
||||
__asm__ __volatile__( \
|
||||
/*src+0*ystride*/ \
|
||||
"movq (%[src]),%%mm0\n\t" \
|
||||
/*src+1*ystride*/ \
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||
/*ystride3=ystride*3*/ \
|
||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
|
||||
/*src+2*ystride*/ \
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||
/*src+3*ystride*/ \
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||
/*dst+0*ystride*/ \
|
||||
"movq %%mm0,(%[dst])\n\t" \
|
||||
/*dst+1*ystride*/ \
|
||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||
/*Pointer to next 4.*/ \
|
||||
"lea (%[src],%[ystride],4),%[src]\n\t" \
|
||||
/*dst+2*ystride*/ \
|
||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||
/*dst+3*ystride*/ \
|
||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||
/*Pointer to next 4.*/ \
|
||||
"lea (%[dst],%[ystride],4),%[dst]\n\t" \
|
||||
/*src+0*ystride*/ \
|
||||
"movq (%[src]),%%mm0\n\t" \
|
||||
/*src+1*ystride*/ \
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||
/*src+2*ystride*/ \
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||
/*src+3*ystride*/ \
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||
/*dst+0*ystride*/ \
|
||||
"movq %%mm0,(%[dst])\n\t" \
|
||||
/*dst+1*ystride*/ \
|
||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||
/*dst+2*ystride*/ \
|
||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||
/*dst+3*ystride*/ \
|
||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||
:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
|
||||
:[ystride]"r"((ptrdiff_t)(_ystride)) \
|
||||
:"memory" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
# endif
|
||||
#endif
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -30,89 +30,66 @@
|
||||
|
||||
|
||||
|
||||
/*A table of constants used by the MMX routines.*/
|
||||
static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
OC_IDCT_CONSTS[(7+1)*4]={
|
||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
||||
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
|
||||
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
|
||||
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
|
||||
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
|
||||
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
|
||||
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
|
||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
||||
8, 8, 8, 8
|
||||
};
|
||||
|
||||
/*Converts the expression in the argument to a string.*/
|
||||
#define OC_M2STR(_s) #_s
|
||||
|
||||
/*38 cycles*/
|
||||
#define OC_IDCT_BEGIN \
|
||||
#define OC_IDCT_BEGIN(_y,_x) \
|
||||
"#OC_IDCT_BEGIN\n\t" \
|
||||
"movq "OC_I(3)",%%mm2\n\t" \
|
||||
"movq "OC_C(3)",%%mm6\n\t" \
|
||||
"movq "OC_I(3,_x)",%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"movq "OC_J(5)",%%mm7\n\t" \
|
||||
"movq "OC_J(5,_x)",%%mm7\n\t" \
|
||||
"pmulhw %%mm6,%%mm4\n\t" \
|
||||
"movq "OC_C(5)",%%mm1\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
|
||||
"pmulhw %%mm7,%%mm6\n\t" \
|
||||
"movq %%mm1,%%mm5\n\t" \
|
||||
"pmulhw %%mm2,%%mm1\n\t" \
|
||||
"movq "OC_I(1)",%%mm3\n\t" \
|
||||
"movq "OC_I(1,_x)",%%mm3\n\t" \
|
||||
"pmulhw %%mm7,%%mm5\n\t" \
|
||||
"movq "OC_C(1)",%%mm0\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
|
||||
"paddw %%mm2,%%mm4\n\t" \
|
||||
"paddw %%mm7,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"movq "OC_J(7)",%%mm1\n\t" \
|
||||
"movq "OC_J(7,_x)",%%mm1\n\t" \
|
||||
"paddw %%mm5,%%mm7\n\t" \
|
||||
"movq %%mm0,%%mm5\n\t" \
|
||||
"pmulhw %%mm3,%%mm0\n\t" \
|
||||
"paddw %%mm7,%%mm4\n\t" \
|
||||
"pmulhw %%mm1,%%mm5\n\t" \
|
||||
"movq "OC_C(7)",%%mm7\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"paddw %%mm3,%%mm0\n\t" \
|
||||
"pmulhw %%mm7,%%mm3\n\t" \
|
||||
"movq "OC_I(2)",%%mm2\n\t" \
|
||||
"movq "OC_I(2,_x)",%%mm2\n\t" \
|
||||
"pmulhw %%mm1,%%mm7\n\t" \
|
||||
"paddw %%mm1,%%mm5\n\t" \
|
||||
"movq %%mm2,%%mm1\n\t" \
|
||||
"pmulhw "OC_C(2)",%%mm2\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
|
||||
"psubw %%mm5,%%mm3\n\t" \
|
||||
"movq "OC_J(6)",%%mm5\n\t" \
|
||||
"movq "OC_J(6,_x)",%%mm5\n\t" \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"movq %%mm5,%%mm7\n\t" \
|
||||
"psubw %%mm4,%%mm0\n\t" \
|
||||
"pmulhw "OC_C(2)",%%mm5\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"pmulhw "OC_C(6)",%%mm1\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"psubw %%mm6,%%mm3\n\t" \
|
||||
"paddw %%mm7,%%mm5\n\t" \
|
||||
"paddw %%mm6,%%mm6\n\t" \
|
||||
"pmulhw "OC_C(6)",%%mm7\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
|
||||
"paddw %%mm3,%%mm6\n\t" \
|
||||
"movq %%mm4,"OC_I(1)"\n\t" \
|
||||
"movq %%mm4,"OC_I(1,_y)"\n\t" \
|
||||
"psubw %%mm5,%%mm1\n\t" \
|
||||
"movq "OC_C(4)",%%mm4\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"pmulhw %%mm4,%%mm3\n\t" \
|
||||
"paddw %%mm2,%%mm7\n\t" \
|
||||
"movq %%mm6,"OC_I(2)"\n\t" \
|
||||
"movq %%mm6,"OC_I(2,_y)"\n\t" \
|
||||
"movq %%mm0,%%mm2\n\t" \
|
||||
"movq "OC_I(0)",%%mm6\n\t" \
|
||||
"movq "OC_I(0,_x)",%%mm6\n\t" \
|
||||
"pmulhw %%mm4,%%mm0\n\t" \
|
||||
"paddw %%mm3,%%mm5\n\t" \
|
||||
"movq "OC_J(4)",%%mm3\n\t" \
|
||||
"movq "OC_J(4,_x)",%%mm3\n\t" \
|
||||
"psubw %%mm1,%%mm5\n\t" \
|
||||
"paddw %%mm0,%%mm2\n\t" \
|
||||
"psubw %%mm3,%%mm6\n\t" \
|
||||
@ -126,18 +103,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
"paddw %%mm0,%%mm6\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"movq "OC_I(1)",%%mm0\n\t" \
|
||||
"movq "OC_I(1,_y)",%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm2\n\t" \
|
||||
"paddw %%mm3,%%mm4\n\t" \
|
||||
"psubw %%mm1,%%mm2\n\t" \
|
||||
"#end OC_IDCT_BEGIN\n\t" \
|
||||
|
||||
/*38+8=46 cycles.*/
|
||||
#define OC_ROW_IDCT \
|
||||
#define OC_ROW_IDCT(_y,_x) \
|
||||
"#OC_ROW_IDCT\n" \
|
||||
OC_IDCT_BEGIN \
|
||||
OC_IDCT_BEGIN(_y,_x) \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2)",%%mm3\n\t" \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r4=E'=E-G*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
@ -162,7 +139,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*Save R1.*/ \
|
||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r0=R0=G.+C.*/ \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"#end OC_ROW_IDCT\n\t" \
|
||||
@ -195,11 +172,11 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
|
||||
Since r1 is free at entry, we calculate the Js first.*/
|
||||
/*19 cycles.*/
|
||||
#define OC_TRANSPOSE \
|
||||
#define OC_TRANSPOSE(_y) \
|
||||
"#OC_TRANSPOSE\n\t" \
|
||||
"movq %%mm4,%%mm1\n\t" \
|
||||
"punpcklwd %%mm5,%%mm4\n\t" \
|
||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"punpckhwd %%mm5,%%mm1\n\t" \
|
||||
"movq %%mm6,%%mm0\n\t" \
|
||||
"punpcklwd %%mm7,%%mm6\n\t" \
|
||||
@ -207,17 +184,17 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
"punpckldq %%mm6,%%mm4\n\t" \
|
||||
"punpckhdq %%mm6,%%mm5\n\t" \
|
||||
"movq %%mm1,%%mm6\n\t" \
|
||||
"movq %%mm4,"OC_J(4)"\n\t" \
|
||||
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||
"punpckhwd %%mm7,%%mm0\n\t" \
|
||||
"movq %%mm5,"OC_J(5)"\n\t" \
|
||||
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||
"punpckhdq %%mm0,%%mm6\n\t" \
|
||||
"movq "OC_I(0)",%%mm4\n\t" \
|
||||
"movq "OC_I(0,_y)",%%mm4\n\t" \
|
||||
"punpckldq %%mm0,%%mm1\n\t" \
|
||||
"movq "OC_I(1)",%%mm5\n\t" \
|
||||
"movq "OC_I(1,_y)",%%mm5\n\t" \
|
||||
"movq %%mm4,%%mm0\n\t" \
|
||||
"movq %%mm6,"OC_J(7)"\n\t" \
|
||||
"movq %%mm6,"OC_J(7,_y)"\n\t" \
|
||||
"punpcklwd %%mm5,%%mm0\n\t" \
|
||||
"movq %%mm1,"OC_J(6)"\n\t" \
|
||||
"movq %%mm1,"OC_J(6,_y)"\n\t" \
|
||||
"punpckhwd %%mm5,%%mm4\n\t" \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklwd %%mm3,%%mm2\n\t" \
|
||||
@ -225,20 +202,20 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
"punpckldq %%mm2,%%mm0\n\t" \
|
||||
"punpckhdq %%mm2,%%mm1\n\t" \
|
||||
"movq %%mm4,%%mm2\n\t" \
|
||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"punpckhwd %%mm3,%%mm5\n\t" \
|
||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
"punpckhdq %%mm5,%%mm4\n\t" \
|
||||
"punpckldq %%mm5,%%mm2\n\t" \
|
||||
"movq %%mm4,"OC_I(3)"\n\t" \
|
||||
"movq %%mm2,"OC_I(2)"\n\t" \
|
||||
"movq %%mm4,"OC_I(3,_y)"\n\t" \
|
||||
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||
"#end OC_TRANSPOSE\n\t" \
|
||||
|
||||
/*38+19=57 cycles.*/
|
||||
#define OC_COLUMN_IDCT \
|
||||
#define OC_COLUMN_IDCT(_y) \
|
||||
"#OC_COLUMN_IDCT\n" \
|
||||
OC_IDCT_BEGIN \
|
||||
"paddw "OC_8",%%mm2\n\t" \
|
||||
OC_IDCT_BEGIN(_y,_y) \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
/*r1=R1=A''+H'*/ \
|
||||
@ -250,18 +227,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
/*r1=NR1*/ \
|
||||
"psraw $4,%%mm1\n\t" \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2)",%%mm3\n\t" \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r7=G+G*/ \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
/*Store NR2 at I(2).*/ \
|
||||
"movq %%mm2,"OC_I(2)"\n\t" \
|
||||
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||
/*r7=G'=E+G*/ \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
/*Store NR1 at I(1).*/ \
|
||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw "OC_8",%%mm4\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
|
||||
/*r3=D'+D'*/ \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
/*r3=R3=E'+D'*/ \
|
||||
@ -272,7 +249,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
/*r3=NR3*/ \
|
||||
"psraw $4,%%mm3\n\t" \
|
||||
"paddw "OC_8",%%mm6\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
|
||||
/*r5=B''+B''*/ \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*r5=R5=F'+B''*/ \
|
||||
@ -280,14 +257,14 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
/*r6=NR6*/ \
|
||||
"psraw $4,%%mm6\n\t" \
|
||||
/*Store NR4 at J(4).*/ \
|
||||
"movq %%mm4,"OC_J(4)"\n\t" \
|
||||
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||
/*r5=NR5*/ \
|
||||
"psraw $4,%%mm5\n\t" \
|
||||
/*Store NR3 at I(3).*/ \
|
||||
"movq %%mm3,"OC_I(3)"\n\t" \
|
||||
"movq %%mm3,"OC_I(3,_y)"\n\t" \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw "OC_8",%%mm7\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
|
||||
/*r0=C'+C'*/ \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
@ -295,113 +272,123 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
||||
/*r7=NR7*/ \
|
||||
"psraw $4,%%mm7\n\t" \
|
||||
/*Store NR6 at J(6).*/ \
|
||||
"movq %%mm6,"OC_J(6)"\n\t" \
|
||||
"movq %%mm6,"OC_J(6,_y)"\n\t" \
|
||||
/*r0=NR0*/ \
|
||||
"psraw $4,%%mm0\n\t" \
|
||||
/*Store NR5 at J(5).*/ \
|
||||
"movq %%mm5,"OC_J(5)"\n\t" \
|
||||
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||
/*Store NR7 at J(7).*/ \
|
||||
"movq %%mm7,"OC_J(7)"\n\t" \
|
||||
"movq %%mm7,"OC_J(7,_y)"\n\t" \
|
||||
/*Store NR0 at I(0).*/ \
|
||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"#end OC_COLUMN_IDCT\n\t" \
|
||||
|
||||
#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
|
||||
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
|
||||
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
|
||||
|
||||
static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
/*This routine accepts an 8x8 matrix, but in partially transposed form.
|
||||
Every 4x4 block is transposed.*/
|
||||
__asm__ __volatile__(
|
||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
||||
#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
|
||||
OC_ROW_IDCT
|
||||
OC_TRANSPOSE
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
|
||||
OC_ROW_IDCT(y,x)
|
||||
OC_TRANSPOSE(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) OC_M2STR((_k*16)+64)"(%[y])"
|
||||
#define OC_J(_k) OC_M2STR(((_k-4)*16)+72)"(%[y])"
|
||||
OC_ROW_IDCT
|
||||
OC_TRANSPOSE
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
|
||||
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
|
||||
OC_ROW_IDCT(y,x)
|
||||
OC_TRANSPOSE(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
:
|
||||
:[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
|
||||
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
|
||||
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
|
||||
);
|
||||
if(_x!=_y){
|
||||
int i;
|
||||
__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
|
||||
for(i=0;i<4;i++){
|
||||
__asm__ __volatile__(
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
|
||||
:[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*25 cycles.*/
|
||||
#define OC_IDCT_BEGIN_10 \
|
||||
#define OC_IDCT_BEGIN_10(_y,_x) \
|
||||
"#OC_IDCT_BEGIN_10\n\t" \
|
||||
"movq "OC_I(3)",%%mm2\n\t" \
|
||||
"movq "OC_I(3,_x)",%%mm2\n\t" \
|
||||
"nop\n\t" \
|
||||
"movq "OC_C(3)",%%mm6\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"movq "OC_C(5)",%%mm1\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
|
||||
"pmulhw %%mm6,%%mm4\n\t" \
|
||||
"movq "OC_I(1)",%%mm3\n\t" \
|
||||
"movq "OC_I(1,_x)",%%mm3\n\t" \
|
||||
"pmulhw %%mm2,%%mm1\n\t" \
|
||||
"movq "OC_C(1)",%%mm0\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
|
||||
"paddw %%mm2,%%mm4\n\t" \
|
||||
"pxor %%mm6,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"movq "OC_I(2)",%%mm5\n\t" \
|
||||
"movq "OC_I(2,_x)",%%mm5\n\t" \
|
||||
"pmulhw %%mm3,%%mm0\n\t" \
|
||||
"movq %%mm5,%%mm1\n\t" \
|
||||
"paddw %%mm3,%%mm0\n\t" \
|
||||
"pmulhw "OC_C(7)",%%mm3\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"pmulhw "OC_C(2)",%%mm5\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
|
||||
"psubw %%mm4,%%mm0\n\t" \
|
||||
"movq "OC_I(2)",%%mm7\n\t" \
|
||||
"movq "OC_I(2,_x)",%%mm7\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
"paddw %%mm5,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"pmulhw "OC_C(6)",%%mm1\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
|
||||
"psubw %%mm6,%%mm3\n\t" \
|
||||
"movq %%mm4,"OC_I(1)"\n\t" \
|
||||
"movq %%mm4,"OC_I(1,_y)"\n\t" \
|
||||
"paddw %%mm6,%%mm6\n\t" \
|
||||
"movq "OC_C(4)",%%mm4\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||
"paddw %%mm3,%%mm6\n\t" \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"pmulhw %%mm4,%%mm3\n\t" \
|
||||
"movq %%mm6,"OC_I(2)"\n\t" \
|
||||
"movq %%mm6,"OC_I(2,_y)"\n\t" \
|
||||
"movq %%mm0,%%mm2\n\t" \
|
||||
"movq "OC_I(0)",%%mm6\n\t" \
|
||||
"movq "OC_I(0,_x)",%%mm6\n\t" \
|
||||
"pmulhw %%mm4,%%mm0\n\t" \
|
||||
"paddw %%mm3,%%mm5\n\t" \
|
||||
"paddw %%mm0,%%mm2\n\t" \
|
||||
"psubw %%mm1,%%mm5\n\t" \
|
||||
"pmulhw %%mm4,%%mm6\n\t" \
|
||||
"paddw "OC_I(0)",%%mm6\n\t" \
|
||||
"paddw "OC_I(0,_x)",%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
"movq %%mm6,%%mm4\n\t" \
|
||||
"paddw %%mm5,%%mm1\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"movq "OC_I(1)",%%mm0\n\t" \
|
||||
"movq "OC_I(1,_y)",%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm2\n\t" \
|
||||
"psubw %%mm1,%%mm2\n\t" \
|
||||
"nop\n\t" \
|
||||
"#end OC_IDCT_BEGIN_10\n\t" \
|
||||
|
||||
/*25+8=33 cycles.*/
|
||||
#define OC_ROW_IDCT_10 \
|
||||
#define OC_ROW_IDCT_10(_y,_x) \
|
||||
"#OC_ROW_IDCT_10\n\t" \
|
||||
OC_IDCT_BEGIN_10 \
|
||||
OC_IDCT_BEGIN_10(_y,_x) \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2)",%%mm3\n\t" \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r4=E'=E-G*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
@ -426,16 +413,16 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*Save R1.*/ \
|
||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"#end OC_ROW_IDCT_10\n\t" \
|
||||
|
||||
/*25+19=44 cycles.*/
|
||||
#define OC_COLUMN_IDCT_10 \
|
||||
#define OC_COLUMN_IDCT_10(_y) \
|
||||
"#OC_COLUMN_IDCT_10\n\t" \
|
||||
OC_IDCT_BEGIN_10 \
|
||||
"paddw "OC_8",%%mm2\n\t" \
|
||||
OC_IDCT_BEGIN_10(_y,_y) \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
/*r1=R1=A''+H'*/ \
|
||||
@ -447,18 +434,18 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
/*r1=NR1*/ \
|
||||
"psraw $4,%%mm1\n\t" \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2)",%%mm3\n\t" \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r7=G+G*/ \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
/*Store NR2 at I(2).*/ \
|
||||
"movq %%mm2,"OC_I(2)"\n\t" \
|
||||
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||
/*r7=G'=E+G*/ \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
/*Store NR1 at I(1).*/ \
|
||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw "OC_8",%%mm4\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
|
||||
/*r3=D'+D'*/ \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
/*r3=R3=E'+D'*/ \
|
||||
@ -469,7 +456,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
/*r3=NR3*/ \
|
||||
"psraw $4,%%mm3\n\t" \
|
||||
"paddw "OC_8",%%mm6\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
|
||||
/*r5=B''+B''*/ \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*r5=R5=F'+B''*/ \
|
||||
@ -477,14 +464,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
/*r6=NR6*/ \
|
||||
"psraw $4,%%mm6\n\t" \
|
||||
/*Store NR4 at J(4).*/ \
|
||||
"movq %%mm4,"OC_J(4)"\n\t" \
|
||||
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||
/*r5=NR5*/ \
|
||||
"psraw $4,%%mm5\n\t" \
|
||||
/*Store NR3 at I(3).*/ \
|
||||
"movq %%mm3,"OC_I(3)"\n\t" \
|
||||
"movq %%mm3,"OC_I(3,_y)"\n\t" \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw "OC_8",%%mm7\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
|
||||
/*r0=C'+C'*/ \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
@ -492,46 +479,57 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
/*r7=NR7*/ \
|
||||
"psraw $4,%%mm7\n\t" \
|
||||
/*Store NR6 at J(6).*/ \
|
||||
"movq %%mm6,"OC_J(6)"\n\t" \
|
||||
"movq %%mm6,"OC_J(6,_y)"\n\t" \
|
||||
/*r0=NR0*/ \
|
||||
"psraw $4,%%mm0\n\t" \
|
||||
/*Store NR5 at J(5).*/ \
|
||||
"movq %%mm5,"OC_J(5)"\n\t" \
|
||||
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||
/*Store NR7 at J(7).*/ \
|
||||
"movq %%mm7,"OC_J(7)"\n\t" \
|
||||
"movq %%mm7,"OC_J(7,_y)"\n\t" \
|
||||
/*Store NR0 at I(0).*/ \
|
||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"#end OC_COLUMN_IDCT_10\n\t" \
|
||||
|
||||
static void oc_idct8x8_10(ogg_int16_t _y[64]){
|
||||
static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
__asm__ __volatile__(
|
||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
||||
#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
|
||||
/*Done with dequant, descramble, and partial transpose.
|
||||
Now do the iDCT itself.*/
|
||||
OC_ROW_IDCT_10
|
||||
OC_TRANSPOSE
|
||||
OC_ROW_IDCT_10(y,x)
|
||||
OC_TRANSPOSE(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT_10
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT_10(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT_10
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT_10(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
:
|
||||
:[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
|
||||
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
|
||||
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
|
||||
);
|
||||
if(_x!=_y){
|
||||
__asm__ __volatile__(
|
||||
"pxor %%mm0,%%mm0\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
|
||||
:[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
@ -557,8 +555,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
||||
gets.
|
||||
Needless to say we inherited this approach from VP3.*/
|
||||
/*Then perform the iDCT.*/
|
||||
if(_last_zzi<10)oc_idct8x8_10(_y);
|
||||
else oc_idct8x8_slow(_y);
|
||||
if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
|
||||
else oc_idct8x8_slow_mmx(_y,_x);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -9,88 +9,191 @@
|
||||
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
||||
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
|
||||
#define OC_LOOP_FILTER8_MMX \
|
||||
"#OC_LOOP_FILTER8_MMX\n\t" \
|
||||
/*mm7=0*/ \
|
||||
"pxor %%mm7,%%mm7\n\t" \
|
||||
/*mm6:mm0={a0,...,a7}*/ \
|
||||
"movq %%mm0,%%mm6\n\t" \
|
||||
"punpcklbw %%mm7,%%mm0\n\t" \
|
||||
"punpckhbw %%mm7,%%mm6\n\t" \
|
||||
/*mm3:mm5={d0,...,d7}*/ \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm3\n\t" \
|
||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
|
||||
"psubw %%mm3,%%mm0\n\t" \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
/*mm3:mm1={b0,...,b7}*/ \
|
||||
"movq %%mm1,%%mm3\n\t" \
|
||||
"punpcklbw %%mm7,%%mm1\n\t" \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"punpckhbw %%mm7,%%mm3\n\t" \
|
||||
/*mm5:mm4={c0,...,c7}*/ \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm4\n\t" \
|
||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||
/*mm7={3}x4 \
|
||||
mm5:mm4={c0-b0,...,c7-b7}*/ \
|
||||
"pcmpeqw %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm1,%%mm4\n\t" \
|
||||
"psrlw $14,%%mm7\n\t" \
|
||||
"psubw %%mm3,%%mm5\n\t" \
|
||||
/*Scale by 3.*/ \
|
||||
"pmullw %%mm7,%%mm4\n\t" \
|
||||
"pmullw %%mm7,%%mm5\n\t" \
|
||||
/*mm7={4}x4 \
|
||||
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
|
||||
"psrlw $1,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"psllw $2,%%mm7\n\t" \
|
||||
"movq (%[ll]),%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*R_i has the range [-127,128], so we compute -R_i instead. \
|
||||
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
"psubw %%mm7,%%mm5\n\t" \
|
||||
"psraw $3,%%mm4\n\t" \
|
||||
"psraw $3,%%mm5\n\t" \
|
||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||
"packsswb %%mm5,%%mm4\n\t" \
|
||||
"pxor %%mm6,%%mm6\n\t" \
|
||||
"pxor %%mm7,%%mm4\n\t" \
|
||||
"packuswb %%mm3,%%mm1\n\t" \
|
||||
/*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
|
||||
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
||||
we have to split things by sign (the other option is to work in 16 bits, \
|
||||
but working in 8 bits gives much better parallelism). \
|
||||
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
|
||||
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
|
||||
Finally, we split mm4 into positive and negative pieces using the mask in \
|
||||
mm6, and add and subtract them as appropriate.*/ \
|
||||
/*mm4=abs(-R_i)*/ \
|
||||
/*mm7=255-2*L*/ \
|
||||
"pcmpgtb %%mm4,%%mm6\n\t" \
|
||||
"psubb %%mm0,%%mm7\n\t" \
|
||||
"pxor %%mm6,%%mm4\n\t" \
|
||||
"psubb %%mm0,%%mm7\n\t" \
|
||||
"psubb %%mm6,%%mm4\n\t" \
|
||||
/*mm7=255-max(2*L-abs(R_i),0)*/ \
|
||||
"paddusb %%mm4,%%mm7\n\t" \
|
||||
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
|
||||
"paddusb %%mm7,%%mm4\n\t" \
|
||||
"psubusb %%mm7,%%mm4\n\t" \
|
||||
/*Now split mm4 by the original sign of -R_i.*/ \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"pand %%mm6,%%mm4\n\t" \
|
||||
"pandn %%mm5,%%mm6\n\t" \
|
||||
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
||||
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
||||
"paddusb %%mm4,%%mm1\n\t" \
|
||||
"psubusb %%mm4,%%mm2\n\t" \
|
||||
"psubusb %%mm6,%%mm1\n\t" \
|
||||
"paddusb %%mm6,%%mm2\n\t" \
|
||||
"#OC_LOOP_FILTER8_MMX\n\t" \
|
||||
/*mm7=0*/ \
|
||||
"pxor %%mm7,%%mm7\n\t" \
|
||||
/*mm6:mm0={a0,...,a7}*/ \
|
||||
"movq %%mm0,%%mm6\n\t" \
|
||||
"punpcklbw %%mm7,%%mm0\n\t" \
|
||||
"punpckhbw %%mm7,%%mm6\n\t" \
|
||||
/*mm3:mm5={d0,...,d7}*/ \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm3\n\t" \
|
||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
|
||||
"psubw %%mm3,%%mm0\n\t" \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
/*mm3:mm1={b0,...,b7}*/ \
|
||||
"movq %%mm1,%%mm3\n\t" \
|
||||
"punpcklbw %%mm7,%%mm1\n\t" \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"punpckhbw %%mm7,%%mm3\n\t" \
|
||||
/*mm5:mm4={c0,...,c7}*/ \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm4\n\t" \
|
||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||
/*mm7={3}x4 \
|
||||
mm5:mm4={c0-b0,...,c7-b7}*/ \
|
||||
"pcmpeqw %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm1,%%mm4\n\t" \
|
||||
"psrlw $14,%%mm7\n\t" \
|
||||
"psubw %%mm3,%%mm5\n\t" \
|
||||
/*Scale by 3.*/ \
|
||||
"pmullw %%mm7,%%mm4\n\t" \
|
||||
"pmullw %%mm7,%%mm5\n\t" \
|
||||
/*mm7={4}x4 \
|
||||
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
|
||||
"psrlw $1,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"psllw $2,%%mm7\n\t" \
|
||||
"movq (%[ll]),%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*R_i has the range [-127,128], so we compute -R_i instead. \
|
||||
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
"psubw %%mm7,%%mm5\n\t" \
|
||||
"psraw $3,%%mm4\n\t" \
|
||||
"psraw $3,%%mm5\n\t" \
|
||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||
"packsswb %%mm5,%%mm4\n\t" \
|
||||
"pxor %%mm6,%%mm6\n\t" \
|
||||
"pxor %%mm7,%%mm4\n\t" \
|
||||
"packuswb %%mm3,%%mm1\n\t" \
|
||||
/*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
|
||||
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
||||
we have to split things by sign (the other option is to work in 16 bits, \
|
||||
but working in 8 bits gives much better parallelism). \
|
||||
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
|
||||
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
|
||||
Finally, we split mm4 into positive and negative pieces using the mask in \
|
||||
mm6, and add and subtract them as appropriate.*/ \
|
||||
/*mm4=abs(-R_i)*/ \
|
||||
/*mm7=255-2*L*/ \
|
||||
"pcmpgtb %%mm4,%%mm6\n\t" \
|
||||
"psubb %%mm0,%%mm7\n\t" \
|
||||
"pxor %%mm6,%%mm4\n\t" \
|
||||
"psubb %%mm0,%%mm7\n\t" \
|
||||
"psubb %%mm6,%%mm4\n\t" \
|
||||
/*mm7=255-max(2*L-abs(R_i),0)*/ \
|
||||
"paddusb %%mm4,%%mm7\n\t" \
|
||||
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
|
||||
"paddusb %%mm7,%%mm4\n\t" \
|
||||
"psubusb %%mm7,%%mm4\n\t" \
|
||||
/*Now split mm4 by the original sign of -R_i.*/ \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"pand %%mm6,%%mm4\n\t" \
|
||||
"pandn %%mm5,%%mm6\n\t" \
|
||||
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
||||
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
||||
"paddusb %%mm4,%%mm1\n\t" \
|
||||
"psubusb %%mm4,%%mm2\n\t" \
|
||||
"psubusb %%mm6,%%mm1\n\t" \
|
||||
"paddusb %%mm6,%%mm2\n\t" \
|
||||
|
||||
#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
|
||||
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
|
||||
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
||||
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
|
||||
All other MMX registers are clobbered.*/
|
||||
#define OC_LOOP_FILTER8_MMXEXT \
|
||||
"#OC_LOOP_FILTER8_MMXEXT\n\t" \
|
||||
/*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
|
||||
-R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
|
||||
/*This first part is based on the transformation \
|
||||
f = -(3*(c-b)+a-d+4>>3) \
|
||||
= -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
|
||||
= -(3*(c+~b)+(a+~d)-1016>>3) \
|
||||
= 127-(3*(c+~b)+(a+~d)>>3) \
|
||||
= 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
|
||||
Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
|
||||
fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
|
||||
Using this, the last expression above can be computed in 8 bits of working \
|
||||
precision via: \
|
||||
u = ~pavgb(~b,c); \
|
||||
v = pavgb(b,~c); \
|
||||
This mask is 0 or 0xFF, and controls whether t is biased up or down: \
|
||||
m = u-v; \
|
||||
t = m^pavgb(m^~a,m^d); \
|
||||
f = 128+pavgb(pavgb(t,u),v); \
|
||||
This required some careful analysis to ensure that carries are propagated \
|
||||
correctly in all cases, but has been checked exhaustively.*/ \
|
||||
/*input (a, b, c, d, ., ., ., .)*/ \
|
||||
/*ff=0xFF; \
|
||||
u=b; \
|
||||
v=c; \
|
||||
ll=255-2*L;*/ \
|
||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||
"movq %%mm1,%%mm4\n\t" \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"movq (%[ll]),%%mm6\n\t" \
|
||||
/*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
|
||||
/*u^=ff; \
|
||||
v^=ff;*/ \
|
||||
"pxor %%mm7,%%mm4\n\t" \
|
||||
"pxor %%mm7,%%mm5\n\t" \
|
||||
/*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
|
||||
/*u=pavgb(u,c); \
|
||||
v=pavgb(v,b);*/ \
|
||||
"pavgb %%mm2,%%mm4\n\t" \
|
||||
"pavgb %%mm1,%%mm5\n\t" \
|
||||
/*u^=ff; \
|
||||
a^=ff;*/ \
|
||||
"pxor %%mm7,%%mm4\n\t" \
|
||||
"pxor %%mm7,%%mm0\n\t" \
|
||||
/*m=u-v;*/ \
|
||||
"psubb %%mm5,%%mm4\n\t" \
|
||||
/*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
|
||||
/*a^=m; \
|
||||
d^=m;*/ \
|
||||
"pxor %%mm4,%%mm0\n\t" \
|
||||
"pxor %%mm4,%%mm3\n\t" \
|
||||
/*t=pavgb(a,d);*/ \
|
||||
"pavgb %%mm3,%%mm0\n\t" \
|
||||
"psllw $7,%%mm7\n\t" \
|
||||
/*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
|
||||
/*t^=m; \
|
||||
u=m+v;*/ \
|
||||
"pxor %%mm4,%%mm0\n\t" \
|
||||
"paddb %%mm5,%%mm4\n\t" \
|
||||
/*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
|
||||
/*f=pavgb(f,u); \
|
||||
of=128;*/ \
|
||||
"pavgb %%mm4,%%mm0\n\t" \
|
||||
"packsswb %%mm7,%%mm7\n\t" \
|
||||
/*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
|
||||
/*f=pavgb(f,v);*/ \
|
||||
"pavgb %%mm5,%%mm0\n\t" \
|
||||
"movq %%mm7,%%mm3\n\t" \
|
||||
"movq %%mm6,%%mm4\n\t" \
|
||||
/*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
|
||||
/*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
|
||||
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
||||
we have to split things by sign (the other option is to work in 16 bits, \
|
||||
but staying in 8 bits gives much better parallelism).*/ \
|
||||
/*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
|
||||
This is the same number of instructions as computing a mask and splitting \
|
||||
after the lflim computation, but has shorter dependency chains.*/ \
|
||||
/*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
|
||||
mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
|
||||
"psubusb %%mm0,%%mm3\n\t" \
|
||||
"psubusb %%mm7,%%mm0\n\t" \
|
||||
/*mm6=255-max(2*L-abs(R_i<0),0) \
|
||||
mm4=255-max(2*L-abs(R_i>0),0)*/ \
|
||||
"paddusb %%mm3,%%mm4\n\t" \
|
||||
"paddusb %%mm0,%%mm6\n\t" \
|
||||
/*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
|
||||
mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
|
||||
"paddusb %%mm4,%%mm3\n\t" \
|
||||
"paddusb %%mm6,%%mm0\n\t" \
|
||||
"psubusb %%mm4,%%mm3\n\t" \
|
||||
"psubusb %%mm6,%%mm0\n\t" \
|
||||
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
||||
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
||||
"paddusb %%mm3,%%mm1\n\t" \
|
||||
"psubusb %%mm3,%%mm2\n\t" \
|
||||
"psubusb %%mm0,%%mm1\n\t" \
|
||||
"paddusb %%mm0,%%mm2\n\t" \
|
||||
|
||||
#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
|
||||
do{ \
|
||||
ptrdiff_t ystride3__; \
|
||||
__asm__ __volatile__( \
|
||||
@ -104,7 +207,7 @@
|
||||
"movq (%[pix],%[ystride]),%%mm1\n\t" \
|
||||
/*mm2={c0,...,c7}*/ \
|
||||
"movq (%[pix],%[ystride],2),%%mm2\n\t" \
|
||||
OC_LOOP_FILTER8_MMX \
|
||||
_filter \
|
||||
/*Write it back out.*/ \
|
||||
"movq %%mm1,(%[pix],%[ystride])\n\t" \
|
||||
"movq %%mm2,(%[pix],%[ystride],2)\n\t" \
|
||||
@ -116,7 +219,7 @@
|
||||
} \
|
||||
while(0)
|
||||
|
||||
#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
|
||||
#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
|
||||
do{ \
|
||||
unsigned char *pix__; \
|
||||
ptrdiff_t ystride3__; \
|
||||
@ -174,7 +277,7 @@
|
||||
"punpckldq %%mm5,%%mm2\n\t" \
|
||||
/*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
|
||||
"punpckhdq %%mm5,%%mm3\n\t" \
|
||||
OC_LOOP_FILTER8_MMX \
|
||||
_filter \
|
||||
/*mm2={b0+R_0'',...,b7+R_7''}*/ \
|
||||
"movq %%mm1,%%mm0\n\t" \
|
||||
/*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -19,23 +19,23 @@
|
||||
Originally written by Rudolf Marek.*/
|
||||
#include <string.h>
|
||||
#include "x86int.h"
|
||||
#include "mmxfrag.h"
|
||||
#include "mmxloop.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||
unsigned char *dst;
|
||||
ptrdiff_t frag_buf_off;
|
||||
int ystride;
|
||||
int mb_mode;
|
||||
int refi;
|
||||
/*Apply the inverse transform.*/
|
||||
/*Special case only having a DC component.*/
|
||||
if(_last_zzi<2){
|
||||
/*Note that this value must be unsigned, to keep the __asm__ block from
|
||||
sign-extending it when it puts it in a register.*/
|
||||
ogg_uint16_t p;
|
||||
int i;
|
||||
/*We round this dequant product (and not any of the others) because there's
|
||||
no iDCT rounding.*/
|
||||
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
|
||||
@ -47,81 +47,48 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
"punpcklwd %%mm0,%%mm0\n\t"
|
||||
/*mm0=AAAA AAAA AAAA AAAA*/
|
||||
"punpckldq %%mm0,%%mm0\n\t"
|
||||
"movq %%mm0,(%[y])\n\t"
|
||||
"movq %%mm0,8(%[y])\n\t"
|
||||
"movq %%mm0,16(%[y])\n\t"
|
||||
"movq %%mm0,24(%[y])\n\t"
|
||||
"movq %%mm0,32(%[y])\n\t"
|
||||
"movq %%mm0,40(%[y])\n\t"
|
||||
"movq %%mm0,48(%[y])\n\t"
|
||||
"movq %%mm0,56(%[y])\n\t"
|
||||
"movq %%mm0,64(%[y])\n\t"
|
||||
"movq %%mm0,72(%[y])\n\t"
|
||||
"movq %%mm0,80(%[y])\n\t"
|
||||
"movq %%mm0,88(%[y])\n\t"
|
||||
"movq %%mm0,96(%[y])\n\t"
|
||||
"movq %%mm0,104(%[y])\n\t"
|
||||
"movq %%mm0,112(%[y])\n\t"
|
||||
"movq %%mm0,120(%[y])\n\t"
|
||||
:
|
||||
:[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
|
||||
:"memory"
|
||||
:[p]"r"((unsigned)p)
|
||||
);
|
||||
for(i=0;i<4;i++){
|
||||
__asm__ __volatile__(
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
|
||||
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
|
||||
);
|
||||
}
|
||||
}
|
||||
else{
|
||||
/*Dequantize the DC coefficient.*/
|
||||
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
||||
oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
|
||||
oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
|
||||
}
|
||||
/*Fill in the target buffer.*/
|
||||
frag_buf_off=_state->frag_buf_offs[_fragi];
|
||||
mb_mode=_state->frags[_fragi].mb_mode;
|
||||
refi=_state->frags[_fragi].refi;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
|
||||
if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
|
||||
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
|
||||
if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
|
||||
else{
|
||||
const unsigned char *ref;
|
||||
int mvoffsets[2];
|
||||
ref=
|
||||
_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
|
||||
+frag_buf_off;
|
||||
ref=_state->ref_frame_data[refi]+frag_buf_off;
|
||||
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
||||
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
|
||||
_state->frag_mvs[_fragi])>1){
|
||||
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
|
||||
_dct_coeffs);
|
||||
_dct_coeffs+64);
|
||||
}
|
||||
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
|
||||
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
|
||||
}
|
||||
}
|
||||
|
||||
/*We copy this entire function to inline the actual MMX routines so that we
|
||||
use only a single indirect call.*/
|
||||
|
||||
/*Copies the fragments specified by the lists of fragment indices from one
|
||||
frame to another.
|
||||
_fragis: A pointer to a list of fragment indices.
|
||||
_nfragis: The number of fragment indices to copy.
|
||||
_dst_frame: The reference frame to copy to.
|
||||
_src_frame: The reference frame to copy from.
|
||||
_pli: The color plane the fragments lie in.*/
|
||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli){
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
const unsigned char *src_frame_data;
|
||||
unsigned char *dst_frame_data;
|
||||
ptrdiff_t fragii;
|
||||
int ystride;
|
||||
dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
|
||||
src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
for(fragii=0;fragii<_nfragis;fragii++){
|
||||
ptrdiff_t frag_buf_off;
|
||||
frag_buf_off=frag_buf_offs[_fragis[fragii]];
|
||||
OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
|
||||
src_frame_data+frag_buf_off,ystride);
|
||||
}
|
||||
/*Loop filter initialization for the plain MMX code path.
  Fills the first 8 bytes of _bv with the loop filter limit; the rest of the
   array is left untouched.
  _bv:     The bounding values array to initialize.
  _flimit: The loop filter limit (stored by memset() as a single byte).*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
|
||||
memset(_bv,_flimit,8);
|
||||
}
|
||||
|
||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||
@ -133,7 +100,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
OC_ALIGN8(unsigned char ll[8]);
|
||||
const oc_fragment_plane *fplane;
|
||||
const oc_fragment *frags;
|
||||
@ -170,13 +137,84 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
if(frags[fragi].coded){
|
||||
unsigned char *ref;
|
||||
ref=ref_frame_data+frag_buf_offs[fragi];
|
||||
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
|
||||
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
|
||||
if(fragi>fragi0){
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
|
||||
}
|
||||
if(fragi0>fragi_top){
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
|
||||
}
|
||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||
OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
|
||||
}
|
||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||
OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
|
||||
}
|
||||
}
|
||||
fragi++;
|
||||
}
|
||||
fragi0+=nhfrags;
|
||||
}
|
||||
}
|
||||
|
||||
/*Loop filter initialization for the MMXEXT code path.
  Fills the first 8 bytes of _bv with ~(_flimit<<1), which as a byte is
   255-2*_flimit (mod 256); the rest of the array is left untouched.
  NOTE(review): this presumably is the "ll=255-2*L" value that
   OC_LOOP_FILTER8_MMXEXT loads from (%[ll]) -- confirm against the operand
   list of OC_LOOP_FILTER_V/OC_LOOP_FILTER_H.
  _bv:     The bounding values array to initialize.
  _flimit: The loop filter limit, L.*/
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
|
||||
memset(_bv,~(_flimit<<1),8);
|
||||
}
|
||||
|
||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||
The filter may be run on the bottom edge, affecting pixels in the next row of
|
||||
fragments, so this row also needs to be available.
|
||||
_bv: The bounding values array.
|
||||
_refi: The index of the frame buffer to filter.
|
||||
_pli: The color plane to filter.
|
||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
const oc_fragment_plane *fplane;
|
||||
const oc_fragment *frags;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
unsigned char *ref_frame_data;
|
||||
ptrdiff_t fragi_top;
|
||||
ptrdiff_t fragi_bot;
|
||||
ptrdiff_t fragi0;
|
||||
ptrdiff_t fragi0_end;
|
||||
int ystride;
|
||||
int nhfrags;
|
||||
fplane=_state->fplanes+_pli;
|
||||
nhfrags=fplane->nhfrags;
|
||||
fragi_top=fplane->froffset;
|
||||
fragi_bot=fragi_top+fplane->nfrags;
|
||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frags=_state->frags;
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
ref_frame_data=_state->ref_frame_data[_refi];
|
||||
/*The following loops are constructed somewhat non-intuitively on purpose.
|
||||
The main idea is: if a block boundary has at least one coded fragment on
|
||||
it, the filter is applied to it.
|
||||
However, the order that the filters are applied in matters, and VP3 chose
|
||||
the somewhat strange ordering used below.*/
|
||||
while(fragi0<fragi0_end){
|
||||
ptrdiff_t fragi;
|
||||
ptrdiff_t fragi_end;
|
||||
fragi=fragi0;
|
||||
fragi_end=fragi+nhfrags;
|
||||
while(fragi<fragi_end){
|
||||
if(frags[fragi].coded){
|
||||
unsigned char *ref;
|
||||
ref=ref_frame_data+frag_buf_offs[fragi];
|
||||
if(fragi>fragi0){
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
|
||||
}
|
||||
if(fragi0>fragi_top){
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
|
||||
}
|
||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
|
||||
}
|
||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
|
||||
}
|
||||
}
|
||||
fragi++;
|
||||
|
460
media/libtheora/lib/x86/sse2idct.c
Normal file
460
media/libtheora/lib/x86/sse2idct.c
Normal file
@ -0,0 +1,460 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*SSE2 acceleration of Theora's iDCT.*/
|
||||
#include "x86int.h"
|
||||
#include "sse2trans.h"
|
||||
#include "../dct.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*A table of constants used by the MMX routines.*/
|
||||
const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
|
||||
8, 8, 8, 8, 8, 8, 8, 8,
|
||||
OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
|
||||
OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
|
||||
OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
|
||||
OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
|
||||
OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
|
||||
OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
|
||||
OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
|
||||
};
|
||||
|
||||
|
||||
/*Performs the first three stages of the iDCT.
  On input, xmm2, xmm6, xmm3, and xmm5 must contain rows 2, 6, 3, and 5 of the
   input, respectively.
  The remaining rows (0, 1, 4, and 7) are loaded from _x at their corresponding
   offsets (0x00, 0x10, 0x40, and 0x70).
  Constants are read from %[c], and the first two 16-byte slots of %[buf] are
   used as scratch space for t[2] and t[3] between stage 1 and stage 3.
  On output, xmm7 down to xmm0 contain the intermediate values t[0] through
   t[7] (xmm7=t[0], xmm6=t[1], ..., xmm0=t[7]), which is the layout the stage 4
   butterflies in OC_IDCT_8x8_D and OC_IDCT_8x8_D_STORE expect.*/
|
||||
#define OC_IDCT_8x8_ABC(_x) \
|
||||
"#OC_IDCT_8x8_ABC\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
/*2-3 rotation by 6pi/16. \
|
||||
xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
|
||||
"movdqa %%xmm1,%%xmm0\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm1\n\t" \
|
||||
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm0\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||
"paddw %%xmm6,%%xmm0\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
|
||||
"paddw %%xmm1,%%xmm2\n\t" \
|
||||
"psubw %%xmm0,%%xmm7\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
"paddw %%xmm4,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
/*5-6 rotation by 3pi/16. \
|
||||
xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
|
||||
"movdqa %%xmm4,%%xmm2\n\t" \
|
||||
"movdqa %%xmm6,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm3,%%xmm4\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm3,%%xmm6\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm2\n\t" \
|
||||
"paddw %%xmm3,%%xmm4\n\t" \
|
||||
"paddw %%xmm5,%%xmm3\n\t" \
|
||||
"paddw %%xmm6,%%xmm3\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
|
||||
"paddw %%xmm5,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
|
||||
"paddw %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
|
||||
"psubw %%xmm4,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
|
||||
/*4-7 rotation by 7pi/16. \
|
||||
xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
|
||||
"movdqa %%xmm3,%%xmm0\n\t" \
|
||||
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm3\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm0\n\t" \
|
||||
"paddw %%xmm6,%%xmm4\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
|
||||
"paddw %%xmm5,%%xmm7\n\t" \
|
||||
"psubw %%xmm4,%%xmm3\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
|
||||
"paddw %%xmm7,%%xmm0\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
|
||||
/*0-1 butterfly. \
|
||||
xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
|
||||
"paddw %%xmm7,%%xmm6\n\t" \
|
||||
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||
"paddw %%xmm7,%%xmm7\n\t" \
|
||||
"psubw %%xmm6,%%xmm7\n\t" \
|
||||
"paddw %%xmm6,%%xmm4\n\t" \
|
||||
/*Stage 2:*/ \
|
||||
/*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
|
||||
7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
|
||||
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||
"paddw %%xmm1,%%xmm3\n\t" \
|
||||
"psubw %%xmm1,%%xmm6\n\t" \
|
||||
"movdqa %%xmm5,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm7,%%xmm5\n\t" \
|
||||
"paddw %%xmm7,%%xmm5\n\t" \
|
||||
"movdqa %%xmm0,%%xmm7\n\t" \
|
||||
"paddw %%xmm2,%%xmm0\n\t" \
|
||||
"psubw %%xmm2,%%xmm7\n\t" \
|
||||
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm7,%%xmm2\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
|
||||
"paddw %%xmm7,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
|
||||
/*Stage 3: \
|
||||
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
|
||||
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
|
||||
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
|
||||
"paddw %%xmm2,%%xmm1\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"paddw %%xmm4,%%xmm7\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm4,%%xmm4\n\t" \
|
||||
"paddw %%xmm5,%%xmm5\n\t" \
|
||||
"psubw %%xmm1,%%xmm2\n\t" \
|
||||
"psubw %%xmm7,%%xmm4\n\t" \
|
||||
"psubw %%xmm6,%%xmm5\n\t" \
|
||||
|
||||
/*Performs the last stage of the iDCT, leaving the result in registers.
  On input, xmm7 down to xmm0 contain the stage 3 outputs t[0] through t[7]
   (xmm7=t[0], xmm6=t[1], ..., xmm0=t[7]).
  On output, xmm0 through xmm7 contain the corresponding rows.
  No rounding or scaling is applied and nothing is written to memory.*/
|
||||
#define OC_IDCT_8x8_D \
|
||||
"#OC_IDCT_8x8_D\n\t" \
|
||||
/*Stage 4: \
|
||||
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
|
||||
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
|
||||
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
|
||||
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
|
||||
"psubw %%xmm0,%%xmm7\n\t" \
|
||||
"psubw %%xmm1,%%xmm6\n\t" \
|
||||
"psubw %%xmm2,%%xmm5\n\t" \
|
||||
"psubw %%xmm3,%%xmm4\n\t" \
|
||||
"paddw %%xmm0,%%xmm0\n\t" \
|
||||
"paddw %%xmm1,%%xmm1\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm3,%%xmm3\n\t" \
|
||||
"paddw %%xmm7,%%xmm0\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"paddw %%xmm5,%%xmm2\n\t" \
|
||||
"paddw %%xmm4,%%xmm3\n\t" \
|
||||
|
||||
/*Performs the last stage of the iDCT, then rounds, scales, and stores the
   result.
  On input, xmm7 down to xmm0 contain the stage 3 outputs t[0] through t[7]
   (xmm7=t[0], xmm6=t[1], ..., xmm0=t[7]).
  Each output row has the constant 8 (row 0 of %[c]) added to it and is then
   shifted right arithmetically by 4 before being written to %[y] at offset
   0x10*row.
  The slot at offset 0x40 of %[y] is used as scratch space for t[3]-t[4] while
   xmm4 holds the constant, and is overwritten with the final row 4 at the
   end.
  On output, xmm0 through xmm7 contain the same (scaled) rows that were
   stored.*/
|
||||
#define OC_IDCT_8x8_D_STORE \
|
||||
"#OC_IDCT_8x8_D_STORE\n\t" \
|
||||
/*Stage 4: \
|
||||
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
|
||||
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
|
||||
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
|
||||
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
|
||||
"psubw %%xmm3,%%xmm4\n\t" \
|
||||
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
|
||||
"psubw %%xmm0,%%xmm7\n\t" \
|
||||
"psubw %%xmm1,%%xmm6\n\t" \
|
||||
"psubw %%xmm2,%%xmm5\n\t" \
|
||||
"paddw %%xmm4,%%xmm7\n\t" \
|
||||
"paddw %%xmm4,%%xmm6\n\t" \
|
||||
"paddw %%xmm4,%%xmm5\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
|
||||
"paddw %%xmm0,%%xmm0\n\t" \
|
||||
"paddw %%xmm1,%%xmm1\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm3,%%xmm3\n\t" \
|
||||
"paddw %%xmm7,%%xmm0\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"psraw $4,%%xmm0\n\t" \
|
||||
"paddw %%xmm5,%%xmm2\n\t" \
|
||||
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
|
||||
"psraw $4,%%xmm1\n\t" \
|
||||
"paddw %%xmm4,%%xmm3\n\t" \
|
||||
"movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
|
||||
"psraw $4,%%xmm2\n\t" \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
|
||||
"psraw $4,%%xmm3\n\t" \
|
||||
"movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
|
||||
"psraw $4,%%xmm4\n\t" \
|
||||
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
|
||||
"psraw $4,%%xmm5\n\t" \
|
||||
"movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
|
||||
"psraw $4,%%xmm6\n\t" \
|
||||
"movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
|
||||
"psraw $4,%%xmm7\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
|
||||
|
||||
/*Performs a full 8x8 inverse DCT using SSE2.
  _y: The buffer to store the result in.
      The _x!=_y test below implies callers may pass the same buffer as _x.
  _x: The input coefficients, as an 8x8 matrix that has been pre-transposed.
      If _x!=_y, all 64 values are set to zero before returning.
  Both buffers are accessed with movdqa and so must be 16-byte aligned.*/
static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  OC_ALIGN16(ogg_int16_t buf[16]);
  /*This routine accepts an 8x8 matrix pre-transposed.*/
  __asm__ __volatile__(
    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
    OC_IDCT_8x8_ABC(x)
    OC_IDCT_8x8_D
    OC_TRANSPOSE_8x8
    /*Store rows 0, 1, 4, and 7 of the transposed first pass in _y, which is
       where the second pass loads them from.
      Rows 2, 3, 5, and 6 are passed on in registers.*/
    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
    OC_IDCT_8x8_ABC(y)
    OC_IDCT_8x8_D_STORE
    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
    /*OC_IDCT_CONSTS has 64 entries (8 rows of 8), so that is the size we
       describe to the compiler.*/
    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
  );
  if(_x!=_y){
    /*Clear input data for next block (decoder only).
      The register is zeroed in the same asm statement that stores it: the
       compiler gives no guarantee that a register written by one asm
       statement still holds that value when a later one executes.*/
    __asm__ __volatile__(
      "pxor %%xmm0,%%xmm0\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x40,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x50,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x60,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x70,x)"\n\t"
      :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,64))
    );
  }
}
|
||||
|
||||
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
   need to work with four columns at a time.
  Doing this in MMX is faster on processors with a 64-bit data path.
  On input, mm0 through mm3 must contain the first four values of rows 0
   through 3 of the input; rows 4 through 7 are treated as zero.
  Constants are read from %[c], and the first 8 bytes at offsets 0x00 and 0x10
   of %[buf] are used as scratch space.
  All four stages are performed, so on output mm0 through mm7 contain the
   corresponding rows (four values each).*/
|
||||
#define OC_IDCT_8x8_10_MMX \
|
||||
"#OC_IDCT_8x8_10_MMX\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
/*2-3 rotation by 6pi/16. \
|
||||
mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
|
||||
"movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
|
||||
"pmulhw %%mm2,%%mm6\n\t" \
|
||||
"pmulhw %%mm2,%%mm7\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
|
||||
"paddw %%mm6,%%mm2\n\t" \
|
||||
"movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
|
||||
"movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*5-6 rotation by 3pi/16. \
|
||||
mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
|
||||
"pmulhw %%mm3,%%mm5\n\t" \
|
||||
"pmulhw %%mm3,%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
|
||||
"paddw %%mm3,%%mm5\n\t" \
|
||||
"paddw %%mm3,%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
|
||||
/*4-7 rotation by 7pi/16. \
|
||||
mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
|
||||
"pmulhw %%mm1,%%mm3\n\t" \
|
||||
"pmulhw %%mm1,%%mm7\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||
"movq %%mm3,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm7\n\t" \
|
||||
/*0-1 butterfly. \
|
||||
mm4=C4, mm0=X0, X4=0.*/ \
|
||||
/*Stage 2:*/ \
|
||||
/*4-5 butterfly: mm3=t[4], mm5=t[5] \
|
||||
7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
|
||||
"psubw %%mm5,%%mm3\n\t" \
|
||||
"paddw %%mm5,%%mm6\n\t" \
|
||||
"movq %%mm4,%%mm1\n\t" \
|
||||
"pmulhw %%mm0,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"movq %%mm7,%%mm0\n\t" \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"paddw %%mm2,%%mm0\n\t" \
|
||||
"psubw %%mm2,%%mm7\n\t" \
|
||||
"movq %%mm1,%%mm2\n\t" \
|
||||
"pmulhw %%mm6,%%mm1\n\t" \
|
||||
"pmulhw %%mm7,%%mm2\n\t" \
|
||||
"paddw %%mm6,%%mm1\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
|
||||
"paddw %%mm7,%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
|
||||
/*Stage 3: \
|
||||
6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
|
||||
0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
|
||||
1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
|
||||
"paddw %%mm2,%%mm1\n\t" \
|
||||
"paddw %%mm5,%%mm6\n\t" \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
"psubw %%mm1,%%mm2\n\t" \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
"psubw %%mm6,%%mm5\n\t" \
|
||||
/*Stage 4: \
|
||||
0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
|
||||
1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
|
||||
2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
|
||||
3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"psubw %%mm1,%%mm6\n\t" \
|
||||
"psubw %%mm2,%%mm5\n\t" \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm1\n\t" \
|
||||
"paddw %%mm5,%%mm2\n\t" \
|
||||
"paddw %%mm4,%%mm3\n\t" \
|
||||
|
||||
/*Performs the first three stages of the iDCT when rows 4 through 7 of the
   input are zero.
  On input, xmm0 through xmm3 must contain rows 0 through 3 of the input.
  Constants are read from %[c], and the first two 16-byte slots of %[buf] are
   used as scratch space.
  The stage 3 outputs are left in the same registers as for OC_IDCT_8x8_ABC, so
   this can be followed directly by OC_IDCT_8x8_D or OC_IDCT_8x8_D_STORE.*/
#define OC_IDCT_8x8_10_ABC \
|
||||
"#OC_IDCT_8x8_10_ABC\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
/*2-3 rotation by 6pi/16. \
|
||||
xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm6\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm7\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
|
||||
"paddw %%xmm6,%%xmm2\n\t" \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*5-6 rotation by 3pi/16. \
|
||||
xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
|
||||
"pmulhw %%xmm3,%%xmm5\n\t" \
|
||||
"pmulhw %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
|
||||
"paddw %%xmm3,%%xmm5\n\t" \
|
||||
"paddw %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
|
||||
/*4-7 rotation by 7pi/16. \
|
||||
xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
|
||||
"pmulhw %%xmm1,%%xmm3\n\t" \
|
||||
"pmulhw %%xmm1,%%xmm7\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
|
||||
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||
"paddw %%xmm1,%%xmm7\n\t" \
|
||||
/*0-1 butterfly. \
|
||||
xmm4=C4, xmm0=X0, X4=0.*/ \
|
||||
/*Stage 2:*/ \
|
||||
/*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
|
||||
7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
|
||||
"psubw %%xmm5,%%xmm3\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"movdqa %%xmm4,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm0,%%xmm4\n\t" \
|
||||
"paddw %%xmm0,%%xmm4\n\t" \
|
||||
"movdqa %%xmm7,%%xmm0\n\t" \
|
||||
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||
"paddw %%xmm2,%%xmm0\n\t" \
|
||||
"psubw %%xmm2,%%xmm7\n\t" \
|
||||
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm7,%%xmm2\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
|
||||
"paddw %%xmm7,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
|
||||
/*Stage 3: \
|
||||
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
|
||||
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
|
||||
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
|
||||
"paddw %%xmm2,%%xmm1\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"paddw %%xmm4,%%xmm7\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm4,%%xmm4\n\t" \
|
||||
"paddw %%xmm5,%%xmm5\n\t" \
|
||||
"psubw %%xmm1,%%xmm2\n\t" \
|
||||
"psubw %%xmm7,%%xmm4\n\t" \
|
||||
"psubw %%xmm6,%%xmm5\n\t" \
|
||||
|
||||
/*Performs an 8x8 inverse DCT using only the first four values of rows 0
   through 3 of the input; every other coefficient is treated as zero.
  The first pass works on four columns in MMX registers, and the second pass
   runs in SSE2 after the intermediate result has been transposed.
  _y: The buffer to store the result in.
      The _x!=_y test below implies callers may pass the same buffer as _x.
  _x: The input coefficients, as an 8x8 matrix that has been pre-transposed.
      If _x!=_y, the 4x4 region that was read is set to zero before returning.
  NOTE(review): The MMX registers are left in use on return; presumably the
   caller is responsible for restoring the FPU state (emms) -- confirm.*/
static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  OC_ALIGN16(ogg_int16_t buf[16]);
  /*This routine accepts an 8x8 matrix pre-transposed.*/
  __asm__ __volatile__(
    /*Load the first four values of rows 2, 3, 1, and 0.*/
    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
    OC_IDCT_8x8_10_MMX
    OC_TRANSPOSE_8x4_MMX2SSE
    OC_IDCT_8x8_10_ABC
    OC_IDCT_8x8_D_STORE
    /*All operands are parenthesized and typed the same way as in
       oc_idct8x8_slow_sse2(), and OC_IDCT_CONSTS is described with its actual
       size of 64 entries.*/
    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
  );
  if(_x!=_y){
    /*Clear input data for next block (decoder only).
      Only the four values at the start of each of the first four rows are
       written, so the operand spans 28 values (the last store ends at byte
       0x38) and is marked read-write to preserve the values in between.*/
    __asm__ __volatile__(
      "pxor %%mm0,%%mm0\n\t"
      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
      :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
    );
  }
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
decoded.
|
||||
In most cases this is an EOB token (the continuation of an EOB run from a
|
||||
previous block counts), and so this is the same as the coefficient count.
|
||||
However, in the case that the last token was NOT an EOB token, but filled
|
||||
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
|
||||
Provided the last token was not a pure zero run, the minimum value it can
|
||||
be is 46, and so that doesn't affect any of the cases in this routine.
|
||||
However, if the last token WAS a pure zero run of length 63, then _last_zzi
|
||||
will be 1 while the number of coefficients decoded is 64.
|
||||
Thus, we will trigger the following special case, where the real
|
||||
coefficient count would not.
|
||||
Note also that a zero run of length 64 will give _last_zzi a value of 0,
|
||||
but we still process the DC coefficient, which might have a non-zero value
|
||||
due to DC prediction.
|
||||
Although convoluted, this is arguably the correct behavior: it allows us to
|
||||
use a smaller transform when the block ends with a long zero run instead
|
||||
of a normal EOB token.
|
||||
It could be smarter... multiple separate zero runs at the end of a block
|
||||
will fool it, but an encoder that generates these really deserves what it
|
||||
gets.
|
||||
Needless to say we inherited this approach from VP3.*/
|
||||
/*Then perform the iDCT.*/
|
||||
if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
|
||||
else oc_idct8x8_slow_sse2(_y,_x);
|
||||
}
|
||||
|
||||
#endif
|
242
media/libtheora/lib/x86/sse2trans.h
Normal file
242
media/libtheora/lib/x86/sse2trans.h
Normal file
@ -0,0 +1,242 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_sse2trans_H)
|
||||
# define _x86_sse2trans_H (1)
|
||||
# include "x86int.h"
|
||||
|
||||
# if defined(OC_X86_64_ASM)
|
||||
/*On x86-64 we can transpose in-place without spilling registers.
  By clever choices of the order to apply the butterflies and the order of
   their outputs, we can take the rows in order and output the columns in order
   without any extra operations and using just one temporary register.
  On input, xmm0 through xmm7 contain rows a through h of the 8x8 matrix; on
   output, xmm0 through xmm7 contain columns 0 through 7.
  xmm8 is the temporary register and is overwritten; no memory is accessed.*/
|
||||
# define OC_TRANSPOSE_8x8 \
|
||||
"#OC_TRANSPOSE_8x8\n\t" \
|
||||
"movdqa %%xmm4,%%xmm8\n\t" \
|
||||
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
|
||||
"punpckhwd %%xmm5,%%xmm8\n\t" \
|
||||
/*xmm5 is free.*/ \
|
||||
"movdqa %%xmm0,%%xmm5\n\t" \
|
||||
/*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||
"punpcklwd %%xmm1,%%xmm0\n\t" \
|
||||
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
|
||||
"punpckhwd %%xmm1,%%xmm5\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm6,%%xmm1\n\t" \
|
||||
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||
"punpcklwd %%xmm7,%%xmm6\n\t" \
|
||||
/*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
|
||||
"punpckhwd %%xmm7,%%xmm1\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm2,%%xmm7\n\t" \
|
||||
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"punpckhwd %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||
"punpcklwd %%xmm3,%%xmm7\n\t" \
|
||||
/*xmm3 is free.*/ \
|
||||
"movdqa %%xmm0,%%xmm3\n\t" \
|
||||
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||
"punpckldq %%xmm7,%%xmm0\n\t" \
|
||||
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||
"punpckhdq %%xmm7,%%xmm3\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm7\n\t" \
|
||||
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
|
||||
"punpckldq %%xmm2,%%xmm5\n\t" \
|
||||
/*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
|
||||
"punpckhdq %%xmm2,%%xmm7\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm4,%%xmm2\n\t" \
|
||||
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||
"punpckhdq %%xmm6,%%xmm4\n\t" \
|
||||
/*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||
"punpckldq %%xmm6,%%xmm2\n\t" \
|
||||
/*xmm6 is free.*/ \
|
||||
"movdqa %%xmm8,%%xmm6\n\t" \
|
||||
/*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
|
||||
"punpckldq %%xmm1,%%xmm6\n\t" \
|
||||
/*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"punpckhdq %%xmm1,%%xmm8\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"punpcklqdq %%xmm2,%%xmm0\n\t" \
|
||||
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||
"punpckhqdq %%xmm2,%%xmm1\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||
"punpckhqdq %%xmm4,%%xmm3\n\t" \
|
||||
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||
"punpcklqdq %%xmm4,%%xmm2\n\t" \
|
||||
/*xmm4 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
|
||||
"punpckhqdq %%xmm6,%%xmm5\n\t" \
|
||||
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
|
||||
"punpcklqdq %%xmm6,%%xmm4\n\t" \
|
||||
/*xmm6 is free.*/ \
|
||||
"movdqa %%xmm7,%%xmm6\n\t" \
|
||||
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
|
||||
"punpckhqdq %%xmm8,%%xmm7\n\t" \
|
||||
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
|
||||
"punpcklqdq %%xmm8,%%xmm6\n\t" \
|
||||
/*xmm8 is free.*/ \
|
||||
|
||||
# else
|
||||
/*Otherwise, we need to spill some values to %[buf] temporarily.
  Again, the butterflies are carefully arranged to get the columns to come out
   in order, minimizing register spills and maximizing the delay between a load
   and when the value loaded is actually used.
  On input, xmm0 through xmm7 contain rows a through h of the 8x8 matrix; on
   output, xmm0 through xmm7 contain columns 0 through 7.
  Both 16-byte slots of %[buf] (offsets 0x00 and 0x10) are overwritten.*/
|
||||
# define OC_TRANSPOSE_8x8 \
|
||||
"#OC_TRANSPOSE_8x8\n\t" \
|
||||
/*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
|
||||
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*xmm0 is free.*/ \
|
||||
"movdqa %%xmm2,%%xmm0\n\t" \
|
||||
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"punpckhwd %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||
"punpcklwd %%xmm3,%%xmm0\n\t" \
|
||||
/*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
|
||||
/*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm6,%%xmm2\n\t" \
|
||||
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||
"punpcklwd %%xmm7,%%xmm6\n\t" \
|
||||
/*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
|
||||
"punpckhwd %%xmm7,%%xmm2\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
|
||||
"punpckhwd %%xmm5,%%xmm7\n\t" \
|
||||
/*xmm5 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm5\n\t" \
|
||||
/*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||
"punpcklwd %%xmm1,%%xmm3\n\t" \
|
||||
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
|
||||
"punpckhwd %%xmm1,%%xmm5\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm7,%%xmm1\n\t" \
|
||||
/*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
|
||||
"punpckldq %%xmm2,%%xmm7\n\t" \
|
||||
/*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"punpckhdq %%xmm2,%%xmm1\n\t" \
|
||||
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
|
||||
/*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm1\n\t" \
|
||||
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||
"punpckhdq %%xmm0,%%xmm3\n\t" \
|
||||
/*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||
"punpckldq %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm0 is free.*/ \
|
||||
"movdqa %%xmm4,%%xmm0\n\t" \
|
||||
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||
"punpckhdq %%xmm6,%%xmm4\n\t" \
|
||||
/*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||
"punpckldq %%xmm6,%%xmm0\n\t" \
|
||||
/*xmm6 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm6\n\t" \
|
||||
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
|
||||
"punpckldq %%xmm2,%%xmm5\n\t" \
|
||||
/*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
|
||||
"punpckhdq %%xmm2,%%xmm6\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||
"punpckhqdq %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"punpcklqdq %%xmm0,%%xmm2\n\t" \
|
||||
/*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
|
||||
/*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||
"punpckhqdq %%xmm4,%%xmm3\n\t" \
|
||||
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||
"punpcklqdq %%xmm4,%%xmm2\n\t" \
|
||||
/*xmm4 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
|
||||
"punpckhqdq %%xmm7,%%xmm5\n\t" \
|
||||
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
|
||||
"punpcklqdq %%xmm7,%%xmm4\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm6,%%xmm7\n\t" \
|
||||
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
|
||||
"punpcklqdq %%xmm0,%%xmm6\n\t" \
|
||||
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
|
||||
"punpckhqdq %%xmm0,%%xmm7\n\t" \
|
||||
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
|
||||
|
||||
# endif
|
||||
|
||||
/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
   four SSE registers.
  No need to be clever here; we have plenty of room.
  On input, mm0 through mm7 contain the first four values of rows a through h;
   on output, xmm0 through xmm3 contain columns 0 through 3.
  xmm4 through xmm7 are used as temporaries and are overwritten.*/
|
||||
# define OC_TRANSPOSE_8x4_MMX2SSE \
|
||||
"#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
|
||||
"movq2dq %%mm0,%%xmm0\n\t" \
|
||||
"movq2dq %%mm1,%%xmm1\n\t" \
|
||||
/*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||
"punpcklwd %%xmm1,%%xmm0\n\t" \
|
||||
"movq2dq %%mm2,%%xmm3\n\t" \
|
||||
"movq2dq %%mm3,%%xmm2\n\t" \
|
||||
/*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||
"punpcklwd %%xmm2,%%xmm3\n\t" \
|
||||
"movq2dq %%mm4,%%xmm4\n\t" \
|
||||
"movq2dq %%mm5,%%xmm5\n\t" \
|
||||
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||
"movq2dq %%mm6,%%xmm7\n\t" \
|
||||
"movq2dq %%mm7,%%xmm6\n\t" \
|
||||
/*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||
"punpcklwd %%xmm6,%%xmm7\n\t" \
|
||||
"movdqa %%xmm0,%%xmm2\n\t" \
|
||||
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||
"punpckldq %%xmm3,%%xmm0\n\t" \
|
||||
/*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||
"punpckhdq %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||
/*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||
"punpckldq %%xmm7,%%xmm4\n\t" \
|
||||
/*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||
"punpckhdq %%xmm7,%%xmm5\n\t" \
|
||||
"movdqa %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"punpcklqdq %%xmm4,%%xmm0\n\t" \
|
||||
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||
"punpckhqdq %%xmm4,%%xmm1\n\t" \
|
||||
"movdqa %%xmm2,%%xmm3\n\t" \
|
||||
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||
"punpcklqdq %%xmm5,%%xmm2\n\t" \
|
||||
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||
"punpckhqdq %%xmm5,%%xmm3\n\t" \
|
||||
|
||||
#endif
|
182
media/libtheora/lib/x86/x86cpu.c
Normal file
182
media/libtheora/lib/x86/x86cpu.c
Normal file
@ -0,0 +1,182 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
CPU capability detection for x86 processors.
|
||||
Originally written by Rudolf Marek.
|
||||
|
||||
function:
|
||||
last mod: $Id: x86cpu.c 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "x86cpu.h"
|
||||
|
||||
#if !defined(OC_X86_ASM)
|
||||
/*Fallback used when OC_X86_ASM is not defined: cpuid cannot be issued, so no
   optional CPU features are reported.
  Return: Always 0.*/
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
# if defined(__amd64__)||defined(__x86_64__)
|
||||
/*Executes the cpuid instruction with _op in eax and stores the resulting eax,
   ebx, ecx, and edx in _eax, _ebx, _ecx, and _edx, which must be lvalues.
  On x86-64, gcc seems to be able to figure out how to save %rbx for us when
   compiling with -fPIC, so ebx is simply named as an output.*/
|
||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||
__asm__ __volatile__( \
|
||||
"cpuid\n\t" \
|
||||
:[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||
:"a"(_op) \
|
||||
:"cc" \
|
||||
)
|
||||
# else
|
||||
/*On x86-32, not so much.
  Executes the cpuid instruction with _op in eax and stores the resulting eax,
   ebx, ecx, and edx in _eax, _ebx, _ecx, and _edx, which must be lvalues.
  %ebx is never named as an operand here: it is exchanged with whichever
   register the compiler picked for _ebx before cpuid, and exchanged back
   afterwards, so that %ebx holds its original value again when the statement
   ends and the cpuid result ends up in _ebx.*/
|
||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||
__asm__ __volatile__( \
|
||||
"xchgl %%ebx,%[ebx]\n\t" \
|
||||
"cpuid\n\t" \
|
||||
"xchgl %%ebx,%[ebx]\n\t" \
|
||||
:[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||
:"a"(_op) \
|
||||
:"cc" \
|
||||
)
|
||||
# endif
|
||||
|
||||
/*Translates the feature bits returned by cpuid function 1 into OC_CPU_X86_*
   flags.
  _edx: The value cpuid function 1 returned in edx.
  _ecx: The value cpuid function 1 returned in ecx.
  Return: The OC_CPU_X86_* flags for the features that are present, or 0 if
           the processor does not even support MMX.*/
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t flags;
  /*If there isn't even MMX (edx bit 23), give up.*/
  if(!(_edx&0x00800000))return 0;
  flags=OC_CPU_X86_MMX;
  /*edx bit 25: SSE, which is reported together with the MMX extensions.*/
  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
  /*edx bit 26: SSE2.*/
  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
  /*ecx bit 0: SSE3 (PNI).*/
  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
  /*ecx bit 9: SSSE3.
    Bit 8 (0x00000100) is Thermal Monitor 2, which says nothing about the
     instruction set.*/
  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3;
  /*ecx bit 19: SSE4.1.*/
  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
  /*ecx bit 20: SSE4.2.*/
  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
  return flags;
}
|
||||
|
||||
/*Translates the extended feature bits reported by cpuid function 0x80000001
   into OC_CPU_X86_* flags.
  _edx: The value the cpuid query returned in edx.
  _ecx: The value the cpuid query returned in ecx.
  Return: The set of OC_CPU_X86_* flags detected, or 0 if the MMX bit is not
           set.*/
static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t caps;
  /*Nothing else is reported unless MMX is present.*/
  if((_edx&0x00800000)==0)return 0;
  caps=OC_CPU_X86_MMX;
  caps|=(_edx&0x00400000)?OC_CPU_X86_MMXEXT:0;
  caps|=(_edx&0x80000000)?OC_CPU_X86_3DNOW:0;
  caps|=(_edx&0x40000000)?OC_CPU_X86_3DNOWEXT:0;
  caps|=(_ecx&0x00000040)?OC_CPU_X86_SSE4A:0;
  caps|=(_ecx&0x00000800)?OC_CPU_X86_SSE5:0;
  return caps;
}
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t flags;
|
||||
ogg_uint32_t eax;
|
||||
ogg_uint32_t ebx;
|
||||
ogg_uint32_t ecx;
|
||||
ogg_uint32_t edx;
|
||||
# if !defined(__amd64__)&&!defined(__x86_64__)
|
||||
/*Not all x86-32 chips support cpuid, so we have to check.*/
|
||||
__asm__ __volatile__(
|
||||
"pushfl\n\t"
|
||||
"pushfl\n\t"
|
||||
"popl %[a]\n\t"
|
||||
"movl %[a],%[b]\n\t"
|
||||
"xorl $0x200000,%[a]\n\t"
|
||||
"pushl %[a]\n\t"
|
||||
"popfl\n\t"
|
||||
"pushfl\n\t"
|
||||
"popl %[a]\n\t"
|
||||
"popfl\n\t"
|
||||
:[a]"=r"(eax),[b]"=r"(ebx)
|
||||
:
|
||||
:"cc"
|
||||
);
|
||||
/*No cpuid.*/
|
||||
if(eax==ebx)return 0;
|
||||
# endif
|
||||
cpuid(0,eax,ebx,ecx,edx);
|
||||
/* l e t n I e n i u n e G*/
|
||||
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
|
||||
/* 6 8 x M T e n i u n e G*/
|
||||
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
|
||||
int family;
|
||||
int model;
|
||||
/*Intel, Transmeta (tested with Crusoe TM5800):*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_intel_flags(edx,ecx);
|
||||
family=(eax>>8)&0xF;
|
||||
model=(eax>>4)&0xF;
|
||||
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
|
||||
unit, so don't use it.*/
|
||||
if(family==6&&(model==9||model==13||model==14)){
|
||||
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
|
||||
}
|
||||
}
|
||||
/* D M A c i t n e h t u A*/
|
||||
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
|
||||
/* C S N y b e d o e G*/
|
||||
ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
|
||||
/*AMD, Geode:*/
|
||||
cpuid(0x80000000,eax,ebx,ecx,edx);
|
||||
if(eax<0x80000001)flags=0;
|
||||
else{
|
||||
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_amd_flags(edx,ecx);
|
||||
}
|
||||
/*Also check for SSE.*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags|=oc_parse_intel_flags(edx,ecx);
|
||||
}
|
||||
/*Technically some VIA chips can be configured in the BIOS to return any
|
||||
string here the user wants.
|
||||
There is a special detection method that can be used to identify such
|
||||
processors, but in my opinion, if the user really wants to change it, they
|
||||
deserve what they get.*/
|
||||
/* s l u a H r u a t n e C*/
|
||||
else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
|
||||
/*VIA:*/
|
||||
/*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
|
||||
chips (thanks to the engineers from Centaur Technology who provided it).
|
||||
These chips support Intel-like cpuid info.
|
||||
The C3-2 (Nehemiah) cores appear to, as well.*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_intel_flags(edx,ecx);
|
||||
if(eax>=0x80000001){
|
||||
/*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
|
||||
We need to check this even if the Intel test succeeds to pick up 3DNow!
|
||||
support on these processors.
|
||||
Unlike actual AMD processors, we cannot _rely_ on this info, since
|
||||
some cores (e.g., the 693 stepping of the Nehemiah) claim to support
|
||||
this function, yet return edx=0, despite the Intel test indicating
|
||||
MMX support.
|
||||
Therefore the features detected here are strictly added to those
|
||||
detected by the Intel test.*/
|
||||
/*TODO: How about earlier chips?*/
|
||||
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||
/*Note: As of the C7, this function returns Intel-style extended feature
|
||||
flags, not AMD-style.
|
||||
Currently, this only defines bits 11, 20, and 29 (0x20100800), which
|
||||
do not conflict with any of the AMD flags we inspect.
|
||||
For the remaining bits, Intel tells us, "Do not count on their value",
|
||||
but VIA assures us that they will all be zero (at least on the C7 and
|
||||
Isaiah chips).
|
||||
In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
|
||||
(0xC0C00000) for something else, we will have to add code to detect
|
||||
the model to decide when it is appropriate to inspect them.*/
|
||||
flags|=oc_parse_amd_flags(edx,ecx);
|
||||
}
|
||||
}
|
||||
else{
|
||||
/*Implement me.*/
|
||||
flags=0;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
#endif
|
@ -10,13 +10,13 @@
|
||||
* *
|
||||
********************************************************************
|
||||
function:
|
||||
last mod: $Id: cpu.h 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: x86cpu.h 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_cpu_H)
|
||||
# define _x86_cpu_H (1)
|
||||
#include "internal.h"
|
||||
#if !defined(_x86_x86cpu_H)
|
||||
# define _x86_x86cpu_H (1)
|
||||
#include "../internal.h"
|
||||
|
||||
#define OC_CPU_X86_MMX (1<<0)
|
||||
#define OC_CPU_X86_3DNOW (1<<1)
|
||||
@ -31,4 +31,6 @@
|
||||
#define OC_CPU_X86_SSE4A (1<<10)
|
||||
#define OC_CPU_X86_SSE5 (1<<11)
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void);
|
||||
|
||||
#endif
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: x86int.h 17578 2010-10-29 04:21:26Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -19,24 +19,104 @@
|
||||
# define _x86_x86int_H (1)
|
||||
# include "../internal.h"
|
||||
|
||||
void oc_state_vtable_init_x86(oc_theora_state *_state);
|
||||
/*Selects how the x86 accelerated routines are bound when OC_X86_ASM is
   defined.
  oc_state_accel_init always maps to oc_state_accel_init_x86.
  With OC_X86_64_ASM, the generic oc_* names below are macros that expand
   directly to a specific MMX/MMXEXT/SSE2 routine; the function-like ones drop
   their _state argument, since the routines they expand to are called without
   it.
  Otherwise, OC_STATE_USE_VTABLE is defined instead.*/
# if defined(OC_X86_ASM)
|
||||
# define oc_state_accel_init oc_state_accel_init_x86
|
||||
# if defined(OC_X86_64_ASM)
|
||||
/*x86-64 guarantees SIMD support up through at least SSE2.
|
||||
If the best routine we have available only needs SSE2 (which at the moment
|
||||
covers all of them), then we can avoid runtime detection and the indirect
|
||||
call.*/
|
||||
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||
oc_frag_copy_mmx(_dst,_src,_ystride)
|
||||
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs) \
|
||||
oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs)
|
||||
# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
|
||||
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
|
||||
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
|
||||
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||
oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
|
||||
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
|
||||
oc_idct8x8_sse2(_y,_x,_last_zzi)
|
||||
# define oc_state_frag_recon oc_state_frag_recon_mmx
|
||||
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||
oc_loop_filter_init_mmxext(_bv,_flimit)
|
||||
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
|
||||
# define oc_restore_fpu(_state) \
|
||||
oc_restore_fpu_mmx()
|
||||
# else
|
||||
# define OC_STATE_USE_VTABLE (1)
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# include "../state.h"
|
||||
# include "x86cpu.h"
|
||||
|
||||
/*Converts the expression in the argument to a string.*/
#define OC_M2STR(_s) #_s

/*OC_MEM_OFFS(_offs,_name): builds the text of an inline asm memory reference
   to the named operand _name displaced by _offs bytes.
  Memory operands do not always include an offset.
  To avoid warnings, we force an offset with %H (which adds 8).
  __GNUC_PREREQ is not provided by every C library, and using an undefined
   function-like macro in #if is a preprocessing error, so test that it exists
   before invoking it.*/
# if defined(__GNUC_PREREQ)
#  if __GNUC_PREREQ(4,0)
#   define OC_MEM_OFFS(_offs,_name) \
  OC_M2STR(_offs-8+%H[_name])
#  endif
# endif
/*If your gcc version doesn't support %H, then you get to suffer the warnings.
  Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
   whole offset, instead of substituting in 0 for the missing operand to +.*/
# if !defined(OC_MEM_OFFS)
#  define OC_MEM_OFFS(_offs,_name) \
  OC_M2STR(_offs+%[_name])
# endif
|
||||
|
||||
/*Produces an lvalue that covers exactly _size elements of type _type starting
   at _ptr, suitable for use as an inline asm memory operand.
  This tells gcc we're going to clobber this memory region, without having to
   clobber all of "memory" and lets us access local buffers directly using the
   stack pointer, without allocating a separate register to point to them.*/
#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
  (*({ \
    struct{ \
      _type array_value__[(_size)]; \
    } *block_addr__; \
    block_addr__=(void *)(_ptr); \
    block_addr__; \
  }))
|
||||
|
||||
/*Produces a const-qualified lvalue that covers exactly _size elements of type
   _type starting at _ptr, suitable for use as an inline asm memory operand.
  This is the read-only counterpart of OC_ARRAY_OPERAND: it describes the exact
   memory region involved to gcc without having to clobber all of "memory",
   and lets us access local buffers directly using the stack pointer, without
   allocating a separate register to point to them.*/
#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
  (*({ \
    const struct{ \
      _type array_value__[(_size)]; \
    } *block_addr__; \
    block_addr__=(const void *)(_ptr); \
    block_addr__; \
  }))
|
||||
|
||||
extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
|
||||
|
||||
void oc_state_accel_init_x86(oc_theora_state *_state);
|
||||
|
||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli);
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
|
||||
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu_mmx(void);
|
||||
|
||||
#endif
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: x86state.c 17421 2010-09-22 16:46:18Z giles $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -19,8 +19,6 @@
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
#include "../cpu.c"
|
||||
|
||||
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
|
||||
each quadrant of the destination.*/
|
||||
static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
||||
@ -39,24 +37,59 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64
|
||||
};
|
||||
|
||||
void oc_state_vtable_init_x86(oc_theora_state *_state){
|
||||
/*Layout: the first 64 entries are a permutation of 0...63, and the remaining
   64 entries are all 64.
  NOTE(review): presumably the trailing entries send indices past the end of
   the block to a spare slot just after the 64 coefficients; confirm against
   the code that reads dct_fzig_zag.*/
/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
|
||||
the destination.*/
|
||||
static const unsigned char OC_FZIG_ZAG_SSE2[128]={
|
||||
0, 8, 1, 2, 9,16,24,17,
|
||||
10, 3, 4,11,18,25,32,40,
|
||||
33,26,19,12, 5, 6,13,20,
|
||||
27,34,41,48,56,49,42,35,
|
||||
28,21,14, 7,15,22,29,36,
|
||||
43,50,57,58,51,44,37,30,
|
||||
23,31,38,45,52,59,60,53,
|
||||
46,39,47,54,61,62,55,63,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64
|
||||
};
|
||||
|
||||
void oc_state_accel_init_x86(oc_theora_state *_state){
|
||||
oc_state_accel_init_c(_state);
|
||||
_state->cpu_flags=oc_cpu_flags_get();
|
||||
# if defined(OC_STATE_USE_VTABLE)
|
||||
if(_state->cpu_flags&OC_CPU_X86_MMX){
|
||||
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
|
||||
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
|
||||
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
||||
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
||||
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
|
||||
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
|
||||
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
|
||||
_state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
|
||||
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
|
||||
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||
oc_state_loop_filter_frag_rows_mmx;
|
||||
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
|
||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
|
||||
}
|
||||
else oc_state_vtable_init_c(_state);
|
||||
if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
|
||||
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
|
||||
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||
oc_state_loop_filter_frag_rows_mmxext;
|
||||
}
|
||||
if(_state->cpu_flags&OC_CPU_X86_SSE2){
|
||||
_state->opt_vtable.idct8x8=oc_idct8x8_sse2;
|
||||
# endif
|
||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
|
||||
# if defined(OC_STATE_USE_VTABLE)
|
||||
}
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
|
||||
last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -22,10 +22,61 @@
|
||||
The iteration each instruction belongs to is marked in the comments as #i.*/
|
||||
#include <stddef.h>
|
||||
#include "x86int.h"
|
||||
#include "mmxfrag.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Usage notes for OC_FRAG_COPY_MMX:
  The macro does not define SRC, DST, YSTRIDE, or YSTRIDE3 itself; they must be
   #defined to register names at the point of use.
  _ystride is named directly in an __asm mov statement, so it has to be
   something that statement can load from (e.g., a plain variable), not an
   arbitrary expression.
  The copy is done four rows at a time through mm0...mm3, and SRC and DST are
   each advanced by 4*YSTRIDE between the two halves.*/
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||
between rows.*/
|
||||
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
||||
do{ \
|
||||
const unsigned char *src; \
|
||||
unsigned char *dst; \
|
||||
src=(_src); \
|
||||
dst=(_dst); \
|
||||
__asm mov SRC,src \
|
||||
__asm mov DST,dst \
|
||||
__asm mov YSTRIDE,_ystride \
|
||||
/*src+0*ystride*/ \
|
||||
__asm movq mm0,[SRC] \
|
||||
/*src+1*ystride*/ \
|
||||
__asm movq mm1,[SRC+YSTRIDE] \
|
||||
/*ystride3=ystride*3*/ \
|
||||
__asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
|
||||
/*src+2*ystride*/ \
|
||||
__asm movq mm2,[SRC+YSTRIDE*2] \
|
||||
/*src+3*ystride*/ \
|
||||
__asm movq mm3,[SRC+YSTRIDE3] \
|
||||
/*dst+0*ystride*/ \
|
||||
__asm movq [DST],mm0 \
|
||||
/*dst+1*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE],mm1 \
|
||||
/*Pointer to next 4.*/ \
|
||||
__asm lea SRC,[SRC+YSTRIDE*4] \
|
||||
/*dst+2*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE*2],mm2 \
|
||||
/*dst+3*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE3],mm3 \
|
||||
/*Pointer to next 4.*/ \
|
||||
__asm lea DST,[DST+YSTRIDE*4] \
|
||||
/*src+0*ystride*/ \
|
||||
__asm movq mm0,[SRC] \
|
||||
/*src+1*ystride*/ \
|
||||
__asm movq mm1,[SRC+YSTRIDE] \
|
||||
/*src+2*ystride*/ \
|
||||
__asm movq mm2,[SRC+YSTRIDE*2] \
|
||||
/*src+3*ystride*/ \
|
||||
__asm movq mm3,[SRC+YSTRIDE3] \
|
||||
/*dst+0*ystride*/ \
|
||||
__asm movq [DST],mm0 \
|
||||
/*dst+1*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE],mm1 \
|
||||
/*dst+2*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE*2],mm2 \
|
||||
/*dst+3*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE3],mm3 \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||
between rows.*/
|
||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
@ -41,6 +92,34 @@ void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
#undef YSTRIDE3
|
||||
}
|
||||
|
||||
/*Copies a list of fragments from one reference frame to another, one 8x8
   block at a time, using OC_FRAG_COPY_MMX.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t list_pos;
/*Registers used by OC_FRAG_COPY_MMX.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
  list_pos=0;
  while(list_pos<_nfragis){
    ptrdiff_t block_off;
    /*The same offset locates the fragment in both frames.*/
    block_off=_frag_buf_offs[_fragis[list_pos]];
    OC_FRAG_COPY_MMX(_dst_frame+block_off,
     _src_frame+block_off,_ystride);
    list_pos++;
  }
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
|
||||
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue){
|
||||
__asm{
|
||||
|
@ -1,61 +0,0 @@
|
||||
#if !defined(_x86_vc_mmxfrag_H)
|
||||
# define _x86_vc_mmxfrag_H (1)
|
||||
# include <stddef.h>
|
||||
# include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||
between rows.*/
|
||||
#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
||||
do{ \
|
||||
const unsigned char *src; \
|
||||
unsigned char *dst; \
|
||||
src=(_src); \
|
||||
dst=(_dst); \
|
||||
__asm mov SRC,src \
|
||||
__asm mov DST,dst \
|
||||
__asm mov YSTRIDE,_ystride \
|
||||
/*src+0*ystride*/ \
|
||||
__asm movq mm0,[SRC] \
|
||||
/*src+1*ystride*/ \
|
||||
__asm movq mm1,[SRC+YSTRIDE] \
|
||||
/*ystride3=ystride*3*/ \
|
||||
__asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
|
||||
/*src+2*ystride*/ \
|
||||
__asm movq mm2,[SRC+YSTRIDE*2] \
|
||||
/*src+3*ystride*/ \
|
||||
__asm movq mm3,[SRC+YSTRIDE3] \
|
||||
/*dst+0*ystride*/ \
|
||||
__asm movq [DST],mm0 \
|
||||
/*dst+1*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE],mm1 \
|
||||
/*Pointer to next 4.*/ \
|
||||
__asm lea SRC,[SRC+YSTRIDE*4] \
|
||||
/*dst+2*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE*2],mm2 \
|
||||
/*dst+3*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE3],mm3 \
|
||||
/*Pointer to next 4.*/ \
|
||||
__asm lea DST,[DST+YSTRIDE*4] \
|
||||
/*src+0*ystride*/ \
|
||||
__asm movq mm0,[SRC] \
|
||||
/*src+1*ystride*/ \
|
||||
__asm movq mm1,[SRC+YSTRIDE] \
|
||||
/*src+2*ystride*/ \
|
||||
__asm movq mm2,[SRC+YSTRIDE*2] \
|
||||
/*src+3*ystride*/ \
|
||||
__asm movq mm3,[SRC+YSTRIDE3] \
|
||||
/*dst+0*ystride*/ \
|
||||
__asm movq [DST],mm0 \
|
||||
/*dst+1*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE],mm1 \
|
||||
/*dst+2*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE*2],mm2 \
|
||||
/*dst+3*ystride*/ \
|
||||
__asm movq [DST+YSTRIDE3],mm3 \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
# endif
|
||||
#endif
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -24,15 +24,15 @@
|
||||
|
||||
/*These are offsets into the table of constants below.*/
|
||||
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
|
||||
#define OC_COSINE_OFFSET (0)
|
||||
#define OC_COSINE_OFFSET (8)
|
||||
/*A row of 8's.*/
|
||||
#define OC_EIGHT_OFFSET (56)
|
||||
#define OC_EIGHT_OFFSET (0)
|
||||
|
||||
|
||||
|
||||
/*A table of constants used by the MMX routines.*/
|
||||
static const __declspec(align(16))ogg_uint16_t
|
||||
OC_IDCT_CONSTS[(7+1)*4]={
|
||||
static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
|
||||
8, 8, 8, 8,
|
||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
||||
@ -46,28 +46,27 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
||||
8, 8, 8, 8
|
||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
|
||||
};
|
||||
|
||||
/*38 cycles*/
|
||||
#define OC_IDCT_BEGIN __asm{ \
|
||||
__asm movq mm2,OC_I(3) \
|
||||
#define OC_IDCT_BEGIN(_y,_x) __asm{ \
|
||||
__asm movq mm2,OC_I(3,_x) \
|
||||
__asm movq mm6,OC_C(3) \
|
||||
__asm movq mm4,mm2 \
|
||||
__asm movq mm7,OC_J(5) \
|
||||
__asm movq mm7,OC_J(5,_x) \
|
||||
__asm pmulhw mm4,mm6 \
|
||||
__asm movq mm1,OC_C(5) \
|
||||
__asm pmulhw mm6,mm7 \
|
||||
__asm movq mm5,mm1 \
|
||||
__asm pmulhw mm1,mm2 \
|
||||
__asm movq mm3,OC_I(1) \
|
||||
__asm movq mm3,OC_I(1,_x) \
|
||||
__asm pmulhw mm5,mm7 \
|
||||
__asm movq mm0,OC_C(1) \
|
||||
__asm paddw mm4,mm2 \
|
||||
__asm paddw mm6,mm7 \
|
||||
__asm paddw mm2,mm1 \
|
||||
__asm movq mm1,OC_J(7) \
|
||||
__asm movq mm1,OC_J(7,_x) \
|
||||
__asm paddw mm7,mm5 \
|
||||
__asm movq mm5,mm0 \
|
||||
__asm pmulhw mm0,mm3 \
|
||||
@ -77,13 +76,13 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
__asm psubw mm6,mm2 \
|
||||
__asm paddw mm0,mm3 \
|
||||
__asm pmulhw mm3,mm7 \
|
||||
__asm movq mm2,OC_I(2) \
|
||||
__asm movq mm2,OC_I(2,_x) \
|
||||
__asm pmulhw mm7,mm1 \
|
||||
__asm paddw mm5,mm1 \
|
||||
__asm movq mm1,mm2 \
|
||||
__asm pmulhw mm2,OC_C(2) \
|
||||
__asm psubw mm3,mm5 \
|
||||
__asm movq mm5,OC_J(6) \
|
||||
__asm movq mm5,OC_J(6,_x) \
|
||||
__asm paddw mm0,mm7 \
|
||||
__asm movq mm7,mm5 \
|
||||
__asm psubw mm0,mm4 \
|
||||
@ -97,18 +96,18 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
__asm paddw mm6,mm6 \
|
||||
__asm pmulhw mm7,OC_C(6) \
|
||||
__asm paddw mm6,mm3 \
|
||||
__asm movq OC_I(1),mm4 \
|
||||
__asm movq OC_I(1,_y),mm4 \
|
||||
__asm psubw mm1,mm5 \
|
||||
__asm movq mm4,OC_C(4) \
|
||||
__asm movq mm5,mm3 \
|
||||
__asm pmulhw mm3,mm4 \
|
||||
__asm paddw mm7,mm2 \
|
||||
__asm movq OC_I(2),mm6 \
|
||||
__asm movq OC_I(2,_y),mm6 \
|
||||
__asm movq mm2,mm0 \
|
||||
__asm movq mm6,OC_I(0) \
|
||||
__asm movq mm6,OC_I(0,_x) \
|
||||
__asm pmulhw mm0,mm4 \
|
||||
__asm paddw mm5,mm3 \
|
||||
__asm movq mm3,OC_J(4) \
|
||||
__asm movq mm3,OC_J(4,_x) \
|
||||
__asm psubw mm5,mm1 \
|
||||
__asm paddw mm2,mm0 \
|
||||
__asm psubw mm6,mm3 \
|
||||
@ -122,17 +121,17 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
__asm paddw mm6,mm0 \
|
||||
__asm psubw mm6,mm2 \
|
||||
__asm paddw mm2,mm2 \
|
||||
__asm movq mm0,OC_I(1) \
|
||||
__asm movq mm0,OC_I(1,_y) \
|
||||
__asm paddw mm2,mm6 \
|
||||
__asm paddw mm4,mm3 \
|
||||
__asm psubw mm2,mm1 \
|
||||
}
|
||||
|
||||
/*38+8=46 cycles.*/
|
||||
#define OC_ROW_IDCT __asm{ \
|
||||
OC_IDCT_BEGIN \
|
||||
#define OC_ROW_IDCT(_y,_x) __asm{ \
|
||||
OC_IDCT_BEGIN(_y,_x) \
|
||||
/*r3=D'*/ \
|
||||
__asm movq mm3,OC_I(2) \
|
||||
__asm movq mm3,OC_I(2,_y) \
|
||||
/*r4=E'=E-G*/ \
|
||||
__asm psubw mm4,mm7 \
|
||||
/*r1=H'+H'*/ \
|
||||
@ -157,7 +156,7 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
__asm psubw mm7,mm0 \
|
||||
__asm paddw mm0,mm0 \
|
||||
/*Save R1.*/ \
|
||||
__asm movq OC_I(1),mm1 \
|
||||
__asm movq OC_I(1,_y),mm1 \
|
||||
/*r0=R0=G.+C.*/ \
|
||||
__asm paddw mm0,mm7 \
|
||||
}
|
||||
@ -190,10 +189,10 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
|
||||
Since r1 is free at entry, we calculate the Js first.*/
|
||||
/*19 cycles.*/
|
||||
#define OC_TRANSPOSE __asm{ \
|
||||
#define OC_TRANSPOSE(_y) __asm{ \
|
||||
__asm movq mm1,mm4 \
|
||||
__asm punpcklwd mm4,mm5 \
|
||||
__asm movq OC_I(0),mm0 \
|
||||
__asm movq OC_I(0,_y),mm0 \
|
||||
__asm punpckhwd mm1,mm5 \
|
||||
__asm movq mm0,mm6 \
|
||||
__asm punpcklwd mm6,mm7 \
|
||||
@ -201,17 +200,17 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
__asm punpckldq mm4,mm6 \
|
||||
__asm punpckhdq mm5,mm6 \
|
||||
__asm movq mm6,mm1 \
|
||||
__asm movq OC_J(4),mm4 \
|
||||
__asm movq OC_J(4,_y),mm4 \
|
||||
__asm punpckhwd mm0,mm7 \
|
||||
__asm movq OC_J(5),mm5 \
|
||||
__asm movq OC_J(5,_y),mm5 \
|
||||
__asm punpckhdq mm6,mm0 \
|
||||
__asm movq mm4,OC_I(0) \
|
||||
__asm movq mm4,OC_I(0,_y) \
|
||||
__asm punpckldq mm1,mm0 \
|
||||
__asm movq mm5,OC_I(1) \
|
||||
__asm movq mm5,OC_I(1,_y) \
|
||||
__asm movq mm0,mm4 \
|
||||
__asm movq OC_J(7),mm6 \
|
||||
__asm movq OC_J(7,_y),mm6 \
|
||||
__asm punpcklwd mm0,mm5 \
|
||||
__asm movq OC_J(6),mm1 \
|
||||
__asm movq OC_J(6,_y),mm1 \
|
||||
__asm punpckhwd mm4,mm5 \
|
||||
__asm movq mm5,mm2 \
|
||||
__asm punpcklwd mm2,mm3 \
|
||||
@ -219,18 +218,18 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
__asm punpckldq mm0,mm2 \
|
||||
__asm punpckhdq mm1,mm2 \
|
||||
__asm movq mm2,mm4 \
|
||||
__asm movq OC_I(0),mm0 \
|
||||
__asm movq OC_I(0,_y),mm0 \
|
||||
__asm punpckhwd mm5,mm3 \
|
||||
__asm movq OC_I(1),mm1 \
|
||||
__asm movq OC_I(1,_y),mm1 \
|
||||
__asm punpckhdq mm4,mm5 \
|
||||
__asm punpckldq mm2,mm5 \
|
||||
__asm movq OC_I(3),mm4 \
|
||||
__asm movq OC_I(2),mm2 \
|
||||
__asm movq OC_I(3,_y),mm4 \
|
||||
__asm movq OC_I(2,_y),mm2 \
|
||||
}
|
||||
|
||||
/*38+19=57 cycles.*/
|
||||
#define OC_COLUMN_IDCT __asm{ \
|
||||
OC_IDCT_BEGIN \
|
||||
#define OC_COLUMN_IDCT(_y) __asm{ \
|
||||
OC_IDCT_BEGIN(_y,_y) \
|
||||
__asm paddw mm2,OC_8 \
|
||||
/*r1=H'+H'*/ \
|
||||
__asm paddw mm1,mm1 \
|
||||
@ -243,15 +242,15 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
/*r1=NR1*/ \
|
||||
__asm psraw mm1,4 \
|
||||
/*r3=D'*/ \
|
||||
__asm movq mm3,OC_I(2) \
|
||||
__asm movq mm3,OC_I(2,_y) \
|
||||
/*r7=G+G*/ \
|
||||
__asm paddw mm7,mm7 \
|
||||
/*Store NR2 at I(2).*/ \
|
||||
__asm movq OC_I(2),mm2 \
|
||||
__asm movq OC_I(2,_y),mm2 \
|
||||
/*r7=G'=E+G*/ \
|
||||
__asm paddw mm7,mm4 \
|
||||
/*Store NR1 at I(1).*/ \
|
||||
__asm movq OC_I(1),mm1 \
|
||||
__asm movq OC_I(1,_y),mm1 \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
__asm psubw mm4,mm3 \
|
||||
__asm paddw mm4,OC_8 \
|
||||
@ -273,11 +272,11 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
/*r6=NR6*/ \
|
||||
__asm psraw mm6,4 \
|
||||
/*Store NR4 at J(4).*/ \
|
||||
__asm movq OC_J(4),mm4 \
|
||||
__asm movq OC_J(4,_y),mm4 \
|
||||
/*r5=NR5*/ \
|
||||
__asm psraw mm5,4 \
|
||||
/*Store NR3 at I(3).*/ \
|
||||
__asm movq OC_I(3),mm3 \
|
||||
__asm movq OC_I(3,_y),mm3 \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
__asm psubw mm7,mm0 \
|
||||
__asm paddw mm7,OC_8 \
|
||||
@ -288,71 +287,92 @@ static const __declspec(align(16))ogg_uint16_t
|
||||
/*r7=NR7*/ \
|
||||
__asm psraw mm7,4 \
|
||||
/*Store NR6 at J(6).*/ \
|
||||
__asm movq OC_J(6),mm6 \
|
||||
__asm movq OC_J(6,_y),mm6 \
|
||||
/*r0=NR0*/ \
|
||||
__asm psraw mm0,4 \
|
||||
/*Store NR5 at J(5).*/ \
|
||||
__asm movq OC_J(5),mm5 \
|
||||
__asm movq OC_J(5,_y),mm5 \
|
||||
/*Store NR7 at J(7).*/ \
|
||||
__asm movq OC_J(7),mm7 \
|
||||
__asm movq OC_J(7,_y),mm7 \
|
||||
/*Store NR0 at I(0).*/ \
|
||||
__asm movq OC_I(0),mm0 \
|
||||
__asm movq OC_I(0,_y),mm0 \
|
||||
}
|
||||
|
||||
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
|
||||
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
|
||||
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
|
||||
|
||||
static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
int i;
|
||||
/*This routine accepts an 8x8 matrix, but in partially transposed form.
|
||||
Every 4x4 block is transposed.*/
|
||||
__asm{
|
||||
#define CONSTS eax
|
||||
#define Y edx
|
||||
#define X ecx
|
||||
mov CONSTS,offset OC_IDCT_CONSTS
|
||||
mov Y,_y
|
||||
#define OC_I(_k) [Y+_k*16]
|
||||
#define OC_J(_k) [Y+(_k-4)*16+8]
|
||||
OC_ROW_IDCT
|
||||
OC_TRANSPOSE
|
||||
mov X,_x
|
||||
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
|
||||
OC_ROW_IDCT(Y,X)
|
||||
OC_TRANSPOSE(Y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) [Y+(_k*16)+64]
|
||||
#define OC_J(_k) [Y+(_k-4)*16+72]
|
||||
OC_ROW_IDCT
|
||||
OC_TRANSPOSE
|
||||
#define OC_I(_k,_y) [(_y)+(_k)*16+64]
|
||||
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+72]
|
||||
OC_ROW_IDCT(Y,X)
|
||||
OC_TRANSPOSE(Y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) [Y+_k*16]
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT
|
||||
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT(Y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) [Y+_k*16+8]
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT
|
||||
#define OC_I(_k,_y) [(_y)+(_k)*16+8]
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT(Y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#undef CONSTS
|
||||
#undef Y
|
||||
#undef X
|
||||
}
|
||||
if(_x!=_y){
|
||||
int i;
|
||||
__asm pxor mm0,mm0;
|
||||
for(i=0;i<4;i++){
|
||||
ogg_int16_t *x;
|
||||
x=_x+16*i;
|
||||
#define X ecx
|
||||
__asm{
|
||||
mov X,x
|
||||
movq [X+0x00],mm0
|
||||
movq [X+0x08],mm0
|
||||
movq [X+0x10],mm0
|
||||
movq [X+0x18],mm0
|
||||
}
|
||||
#undef X
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*25 cycles.*/
|
||||
#define OC_IDCT_BEGIN_10 __asm{ \
|
||||
__asm movq mm2,OC_I(3) \
|
||||
#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
|
||||
__asm movq mm2,OC_I(3,_x) \
|
||||
__asm nop \
|
||||
__asm movq mm6,OC_C(3) \
|
||||
__asm movq mm4,mm2 \
|
||||
__asm movq mm1,OC_C(5) \
|
||||
__asm pmulhw mm4,mm6 \
|
||||
__asm movq mm3,OC_I(1) \
|
||||
__asm movq mm3,OC_I(1,_x) \
|
||||
__asm pmulhw mm1,mm2 \
|
||||
__asm movq mm0,OC_C(1) \
|
||||
__asm paddw mm4,mm2 \
|
||||
__asm pxor mm6,mm6 \
|
||||
__asm paddw mm2,mm1 \
|
||||
__asm movq mm5,OC_I(2) \
|
||||
__asm movq mm5,OC_I(2,_x) \
|
||||
__asm pmulhw mm0,mm3 \
|
||||
__asm movq mm1,mm5 \
|
||||
__asm paddw mm0,mm3 \
|
||||
@ -360,43 +380,43 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
__asm psubw mm6,mm2 \
|
||||
__asm pmulhw mm5,OC_C(2) \
|
||||
__asm psubw mm0,mm4 \
|
||||
__asm movq mm7,OC_I(2) \
|
||||
__asm movq mm7,OC_I(2,_x) \
|
||||
__asm paddw mm4,mm4 \
|
||||
__asm paddw mm7,mm5 \
|
||||
__asm paddw mm4,mm0 \
|
||||
__asm pmulhw mm1,OC_C(6) \
|
||||
__asm psubw mm3,mm6 \
|
||||
__asm movq OC_I(1),mm4 \
|
||||
__asm movq OC_I(1,_y),mm4 \
|
||||
__asm paddw mm6,mm6 \
|
||||
__asm movq mm4,OC_C(4) \
|
||||
__asm paddw mm6,mm3 \
|
||||
__asm movq mm5,mm3 \
|
||||
__asm pmulhw mm3,mm4 \
|
||||
__asm movq OC_I(2),mm6 \
|
||||
__asm movq OC_I(2,_y),mm6 \
|
||||
__asm movq mm2,mm0 \
|
||||
__asm movq mm6,OC_I(0) \
|
||||
__asm movq mm6,OC_I(0,_x) \
|
||||
__asm pmulhw mm0,mm4 \
|
||||
__asm paddw mm5,mm3 \
|
||||
__asm paddw mm2,mm0 \
|
||||
__asm psubw mm5,mm1 \
|
||||
__asm pmulhw mm6,mm4 \
|
||||
__asm paddw mm6,OC_I(0) \
|
||||
__asm paddw mm6,OC_I(0,_x) \
|
||||
__asm paddw mm1,mm1 \
|
||||
__asm movq mm4,mm6 \
|
||||
__asm paddw mm1,mm5 \
|
||||
__asm psubw mm6,mm2 \
|
||||
__asm paddw mm2,mm2 \
|
||||
__asm movq mm0,OC_I(1) \
|
||||
__asm movq mm0,OC_I(1,_y) \
|
||||
__asm paddw mm2,mm6 \
|
||||
__asm psubw mm2,mm1 \
|
||||
__asm nop \
|
||||
}
|
||||
|
||||
/*25+8=33 cycles.*/
|
||||
#define OC_ROW_IDCT_10 __asm{ \
|
||||
OC_IDCT_BEGIN_10 \
|
||||
#define OC_ROW_IDCT_10(_y,_x) __asm{ \
|
||||
OC_IDCT_BEGIN_10(_y,_x) \
|
||||
/*r3=D'*/ \
|
||||
__asm movq mm3,OC_I(2) \
|
||||
__asm movq mm3,OC_I(2,_y) \
|
||||
/*r4=E'=E-G*/ \
|
||||
__asm psubw mm4,mm7 \
|
||||
/*r1=H'+H'*/ \
|
||||
@ -421,14 +441,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
__asm psubw mm7,mm0 \
|
||||
__asm paddw mm0,mm0 \
|
||||
/*Save R1.*/ \
|
||||
__asm movq OC_I(1),mm1 \
|
||||
__asm movq OC_I(1,_y),mm1 \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
__asm paddw mm0,mm7 \
|
||||
}
|
||||
|
||||
/*25+19=44 cycles.*/
|
||||
#define OC_COLUMN_IDCT_10 __asm{ \
|
||||
OC_IDCT_BEGIN_10 \
|
||||
#define OC_COLUMN_IDCT_10(_y) __asm{ \
|
||||
OC_IDCT_BEGIN_10(_y,_y) \
|
||||
__asm paddw mm2,OC_8 \
|
||||
/*r1=H'+H'*/ \
|
||||
__asm paddw mm1,mm1 \
|
||||
@ -441,15 +461,15 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
/*r1=NR1*/ \
|
||||
__asm psraw mm1,4 \
|
||||
/*r3=D'*/ \
|
||||
__asm movq mm3,OC_I(2) \
|
||||
__asm movq mm3,OC_I(2,_y) \
|
||||
/*r7=G+G*/ \
|
||||
__asm paddw mm7,mm7 \
|
||||
/*Store NR2 at I(2).*/ \
|
||||
__asm movq OC_I(2),mm2 \
|
||||
__asm movq OC_I(2,_y),mm2 \
|
||||
/*r7=G'=E+G*/ \
|
||||
__asm paddw mm7,mm4 \
|
||||
/*Store NR1 at I(1).*/ \
|
||||
__asm movq OC_I(1),mm1 \
|
||||
__asm movq OC_I(1,_y),mm1 \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
__asm psubw mm4,mm3 \
|
||||
__asm paddw mm4,OC_8 \
|
||||
@ -471,11 +491,11 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
/*r6=NR6*/ \
|
||||
__asm psraw mm6,4 \
|
||||
/*Store NR4 at J(4).*/ \
|
||||
__asm movq OC_J(4),mm4 \
|
||||
__asm movq OC_J(4,_y),mm4 \
|
||||
/*r5=NR5*/ \
|
||||
__asm psraw mm5,4 \
|
||||
/*Store NR3 at I(3).*/ \
|
||||
__asm movq OC_I(3),mm3 \
|
||||
__asm movq OC_I(3,_y),mm3 \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
__asm psubw mm7,mm0 \
|
||||
__asm paddw mm7,OC_8 \
|
||||
@ -486,50 +506,65 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
||||
/*r7=NR7*/ \
|
||||
__asm psraw mm7,4 \
|
||||
/*Store NR6 at J(6).*/ \
|
||||
__asm movq OC_J(6),mm6 \
|
||||
__asm movq OC_J(6,_y),mm6 \
|
||||
/*r0=NR0*/ \
|
||||
__asm psraw mm0,4 \
|
||||
/*Store NR5 at J(5).*/ \
|
||||
__asm movq OC_J(5),mm5 \
|
||||
__asm movq OC_J(5,_y),mm5 \
|
||||
/*Store NR7 at J(7).*/ \
|
||||
__asm movq OC_J(7),mm7 \
|
||||
__asm movq OC_J(7,_y),mm7 \
|
||||
/*Store NR0 at I(0).*/ \
|
||||
__asm movq OC_I(0),mm0 \
|
||||
__asm movq OC_I(0,_y),mm0 \
|
||||
}
|
||||
|
||||
static void oc_idct8x8_10(ogg_int16_t _y[64]){
|
||||
static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
__asm{
|
||||
#define CONSTS eax
|
||||
#define Y edx
|
||||
#define X ecx
|
||||
mov CONSTS,offset OC_IDCT_CONSTS
|
||||
mov Y,_y
|
||||
#define OC_I(_k) [Y+_k*16]
|
||||
#define OC_J(_k) [Y+(_k-4)*16+8]
|
||||
mov X,_x
|
||||
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
|
||||
/*Done with dequant, descramble, and partial transpose.
|
||||
Now do the iDCT itself.*/
|
||||
OC_ROW_IDCT_10
|
||||
OC_TRANSPOSE
|
||||
OC_ROW_IDCT_10(Y,X)
|
||||
OC_TRANSPOSE(Y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) [Y+_k*16]
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT_10
|
||||
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT_10(Y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k) [Y+_k*16+8]
|
||||
#define OC_J(_k) OC_I(_k)
|
||||
OC_COLUMN_IDCT_10
|
||||
#define OC_I(_k,_y) [(_y)+(_k)*16+8]
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT_10(Y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#undef CONSTS
|
||||
#undef Y
|
||||
#undef X
|
||||
}
|
||||
if(_x!=_y){
|
||||
#define X ecx
|
||||
__asm{
|
||||
pxor mm0,mm0;
|
||||
mov X,_x
|
||||
movq [X+0x00],mm0
|
||||
movq [X+0x10],mm0
|
||||
movq [X+0x20],mm0
|
||||
movq [X+0x30],mm0
|
||||
}
|
||||
#undef X
|
||||
}
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to the orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
@ -555,8 +590,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
||||
gets.
|
||||
Needless to say we inherited this approach from VP3.*/
|
||||
/*Perform the iDCT.*/
|
||||
if(_last_zzi<10)oc_idct8x8_10(_y);
|
||||
else oc_idct8x8_slow(_y);
|
||||
if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
|
||||
else oc_idct8x8_slow(_y,_x);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
|
||||
last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -19,17 +19,16 @@
|
||||
Originally written by Rudolf Marek.*/
|
||||
#include <string.h>
|
||||
#include "x86int.h"
|
||||
#include "mmxfrag.h"
|
||||
#include "mmxloop.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||
unsigned char *dst;
|
||||
ptrdiff_t frag_buf_off;
|
||||
int ystride;
|
||||
int mb_mode;
|
||||
int refi;
|
||||
/*Apply the inverse transform.*/
|
||||
/*Special case only having a DC component.*/
|
||||
if(_last_zzi<2){
|
||||
@ -45,6 +44,7 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
#define P ecx
|
||||
mov Y,_dct_coeffs
|
||||
movzx P,p
|
||||
lea Y,[Y+128]
|
||||
/*mm0=0000 0000 0000 AAAA*/
|
||||
movd mm0,P
|
||||
/*mm0=0000 0000 AAAA AAAA*/
|
||||
@ -74,65 +74,32 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
else{
|
||||
/*Dequantize the DC coefficient.*/
|
||||
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
||||
oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
|
||||
oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
|
||||
}
|
||||
/*Fill in the target buffer.*/
|
||||
frag_buf_off=_state->frag_buf_offs[_fragi];
|
||||
mb_mode=_state->frags[_fragi].mb_mode;
|
||||
refi=_state->frags[_fragi].refi;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
|
||||
if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
|
||||
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
|
||||
if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
|
||||
else{
|
||||
const unsigned char *ref;
|
||||
int mvoffsets[2];
|
||||
ref=
|
||||
_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
|
||||
+frag_buf_off;
|
||||
ref=_state->ref_frame_data[refi]+frag_buf_off;
|
||||
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
||||
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
|
||||
_state->frag_mvs[_fragi])>1){
|
||||
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
|
||||
_dct_coeffs);
|
||||
_dct_coeffs+64);
|
||||
}
|
||||
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
|
||||
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
|
||||
}
|
||||
}
|
||||
|
||||
/*We copy this entire function to inline the actual MMX routines so that we
|
||||
use only a single indirect call.*/
|
||||
|
||||
/*Copies the fragments specified by the lists of fragment indices from one
|
||||
frame to another.
|
||||
_fragis: A pointer to a list of fragment indices.
|
||||
_nfragis: The number of fragment indices to copy.
|
||||
_dst_frame: The reference frame to copy to.
|
||||
_src_frame: The reference frame to copy from.
|
||||
_pli: The color plane the fragments lie in.*/
|
||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli){
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
const unsigned char *src_frame_data;
|
||||
unsigned char *dst_frame_data;
|
||||
ptrdiff_t fragii;
|
||||
int ystride;
|
||||
dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
|
||||
src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
for(fragii=0;fragii<_nfragis;fragii++){
|
||||
ptrdiff_t frag_buf_off;
|
||||
frag_buf_off=frag_buf_offs[_fragis[fragii]];
|
||||
#define SRC edx
|
||||
#define DST eax
|
||||
#define YSTRIDE ecx
|
||||
#define YSTRIDE3 edi
|
||||
OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
|
||||
src_frame_data+frag_buf_off,ystride);
|
||||
#undef SRC
|
||||
#undef DST
|
||||
#undef YSTRIDE
|
||||
#undef YSTRIDE3
|
||||
}
|
||||
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
|
||||
memset(_bv,~(_flimit<<1),8);
|
||||
}
|
||||
|
||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||
@ -144,8 +111,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
OC_ALIGN8(unsigned char ll[8]);
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
const oc_fragment_plane *fplane;
|
||||
const oc_fragment *frags;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
@ -156,13 +122,12 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
ptrdiff_t fragi0_end;
|
||||
int ystride;
|
||||
int nhfrags;
|
||||
memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
|
||||
fplane=_state->fplanes+_pli;
|
||||
nhfrags=fplane->nhfrags;
|
||||
fragi_top=fplane->froffset;
|
||||
fragi_bot=fragi_top+fplane->nfrags;
|
||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frags=_state->frags;
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
@ -187,13 +152,13 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
#define LL edx
|
||||
#define D esi
|
||||
#define D_WORD si
|
||||
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
|
||||
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
|
||||
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
|
||||
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
|
||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||
OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
|
||||
OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
|
||||
}
|
||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||
OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
|
||||
OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
|
||||
}
|
||||
#undef PIX
|
||||
#undef YSTRIDE3
|
||||
|
@ -14,41 +14,17 @@
|
||||
Originally written by Rudolf Marek.
|
||||
|
||||
function:
|
||||
last mod: $Id: cpu.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: x86cpu.c 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "cpu.h"
|
||||
#include "x86cpu.h"
|
||||
|
||||
#if !defined(OC_X86_ASM)
|
||||
static ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
# if !defined(_MSC_VER)
|
||||
# if defined(__amd64__)||defined(__x86_64__)
|
||||
/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
|
||||
compiling with -fPIC.*/
|
||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||
__asm__ __volatile__( \
|
||||
"cpuid\n\t" \
|
||||
:[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||
:"a"(_op) \
|
||||
:"cc" \
|
||||
)
|
||||
# else
|
||||
/*On x86-32, not so much.*/
|
||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||
__asm__ __volatile__( \
|
||||
"xchgl %%ebx,%[ebx]\n\t" \
|
||||
"cpuid\n\t" \
|
||||
"xchgl %%ebx,%[ebx]\n\t" \
|
||||
:[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||
:"a"(_op) \
|
||||
:"cc" \
|
||||
)
|
||||
# endif
|
||||
# else
|
||||
/*Why does MSVC need this complicated rigamarole?
|
||||
At this point I honestly do not care.*/
|
||||
|
||||
@ -95,7 +71,6 @@ static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
|
||||
mov [ecx],ebx
|
||||
}
|
||||
}
|
||||
# endif
|
||||
|
||||
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
|
||||
ogg_uint32_t flags;
|
||||
@ -124,7 +99,7 @@ static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
|
||||
return flags;
|
||||
}
|
||||
|
||||
static ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t flags;
|
||||
ogg_uint32_t eax;
|
||||
ogg_uint32_t ebx;
|
||||
@ -132,25 +107,7 @@ static ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t edx;
|
||||
# if !defined(__amd64__)&&!defined(__x86_64__)
|
||||
/*Not all x86-32 chips support cpuid, so we have to check.*/
|
||||
# if !defined(_MSC_VER)
|
||||
__asm__ __volatile__(
|
||||
"pushfl\n\t"
|
||||
"pushfl\n\t"
|
||||
"popl %[a]\n\t"
|
||||
"movl %[a],%[b]\n\t"
|
||||
"xorl $0x200000,%[a]\n\t"
|
||||
"pushl %[a]\n\t"
|
||||
"popfl\n\t"
|
||||
"pushfl\n\t"
|
||||
"popl %[a]\n\t"
|
||||
"popfl\n\t"
|
||||
:[a]"=r"(eax),[b]"=r"(ebx)
|
||||
:
|
||||
:"cc"
|
||||
);
|
||||
# else
|
||||
oc_detect_cpuid_helper(&eax,&ebx);
|
||||
# endif
|
||||
/*No cpuid.*/
|
||||
if(eax==ebx)return 0;
|
||||
# endif
|
||||
@ -159,9 +116,18 @@ static ogg_uint32_t oc_cpu_flags_get(void){
|
||||
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
|
||||
/* 6 8 x M T e n i u n e G*/
|
||||
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
|
||||
int family;
|
||||
int model;
|
||||
/*Intel, Transmeta (tested with Crusoe TM5800):*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_intel_flags(edx,ecx);
|
||||
family=(eax>>8)&0xF;
|
||||
model=(eax>>4)&0xF;
|
||||
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
|
||||
unit, so don't use it.*/
|
||||
if(family==6&&(model==9||model==13||model==14)){
|
||||
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
|
||||
}
|
||||
}
|
||||
/* D M A c i t n e h t u A*/
|
||||
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
|
36
media/libtheora/lib/x86_vc/x86cpu.h
Normal file
36
media/libtheora/lib/x86_vc/x86cpu.h
Normal file
@ -0,0 +1,36 @@
|
||||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
function:
|
||||
last mod: $Id: x86cpu.h 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_vc_x86cpu_H)
|
||||
# define _x86_vc_x86cpu_H (1)
|
||||
#include "../internal.h"
|
||||
|
||||
#define OC_CPU_X86_MMX (1<<0)
|
||||
#define OC_CPU_X86_3DNOW (1<<1)
|
||||
#define OC_CPU_X86_3DNOWEXT (1<<2)
|
||||
#define OC_CPU_X86_MMXEXT (1<<3)
|
||||
#define OC_CPU_X86_SSE (1<<4)
|
||||
#define OC_CPU_X86_SSE2 (1<<5)
|
||||
#define OC_CPU_X86_PNI (1<<6)
|
||||
#define OC_CPU_X86_SSSE3 (1<<7)
|
||||
#define OC_CPU_X86_SSE4_1 (1<<8)
|
||||
#define OC_CPU_X86_SSE4_2 (1<<9)
|
||||
#define OC_CPU_X86_SSE4A (1<<10)
|
||||
#define OC_CPU_X86_SSE5 (1<<11)
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void);
|
||||
|
||||
#endif
|
@ -11,32 +11,39 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: x86int.h 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_vc_x86int_H)
|
||||
# define _x86_vc_x86int_H (1)
|
||||
# include "../internal.h"
|
||||
# if defined(OC_X86_ASM)
|
||||
# define oc_state_accel_init oc_state_accel_init_x86
|
||||
# define OC_STATE_USE_VTABLE (1)
|
||||
# endif
|
||||
# include "../state.h"
|
||||
# include "x86cpu.h"
|
||||
|
||||
void oc_state_vtable_init_x86(oc_theora_state *_state);
|
||||
void oc_state_accel_init_x86(oc_theora_state *_state);
|
||||
|
||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
||||
int _dst_frame,int _src_frame,int _pli);
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu_mmx(void);
|
||||
|
||||
#endif
|
||||
|
@ -11,7 +11,7 @@
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
|
||||
last mod: $Id: x86state.c 17410 2010-09-21 21:53:48Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
@ -19,8 +19,6 @@
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
#include "../cpu.c"
|
||||
|
||||
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
|
||||
each quadrant of the destination.*/
|
||||
static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
||||
@ -42,21 +40,22 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
||||
64,64,64,64,64,64,64,64,
|
||||
};
|
||||
|
||||
void oc_state_vtable_init_x86(oc_theora_state *_state){
|
||||
void oc_state_accel_init_x86(oc_theora_state *_state){
|
||||
_state->cpu_flags=oc_cpu_flags_get();
|
||||
if(_state->cpu_flags&OC_CPU_X86_MMX){
|
||||
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
|
||||
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
|
||||
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
||||
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
||||
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
|
||||
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
|
||||
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
|
||||
_state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
|
||||
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
|
||||
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||
oc_state_loop_filter_frag_rows_mmx;
|
||||
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
|
||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
|
||||
}
|
||||
else oc_state_vtable_init_c(_state);
|
||||
else oc_state_accel_init_c(_state);
|
||||
}
|
||||
#endif
|
||||
|
45
media/libtheora/update.sh
Normal file → Executable file
45
media/libtheora/update.sh
Normal file → Executable file
@ -2,11 +2,20 @@
|
||||
#
|
||||
# Copies the needed files from a directory containing the original
|
||||
# libtheora source that we need for the Mozilla HTML5 media support.
|
||||
sed s/\#define\ OC_X86_ASM//g $1/config.h >./lib/config.h
|
||||
sed s/\#define\ USE_ASM//g ./lib/config.h >./lib/config.h2
|
||||
sed s/\#define\ THEORA_DISABLE_ENCODE//g ./lib/config.h2 >./lib/config.h
|
||||
rm ./lib/config.h2
|
||||
cp ./lib/config.h ./include/theora/config.h
|
||||
sed \
|
||||
-e s/\#define\ OC_X86_ASM//g \
|
||||
-e s/\#define\ OC_X86_64_ASM//g \
|
||||
-e s/\#define\ OC_ARM_ASM_EDSP\ 1//g \
|
||||
-e s/\#define\ OC_ARM_ASM_MEDIA\ 1//g \
|
||||
-e s/\#define\ OC_ARM_ASM_NEON\ 1//g \
|
||||
-e s/\#define\ OC_ARM_ASM//g \
|
||||
-e s/\#define\ THEORA_DISABLE_ENCODE//g \
|
||||
$1/config.h > lib/config.h
|
||||
sed \
|
||||
-e s/@HAVE_ARM_ASM_EDSP@/1/g \
|
||||
-e s/@HAVE_ARM_ASM_MEDIA@/1/g \
|
||||
-e s/@HAVE_ARM_ASM_NEON@/1/g \
|
||||
$1/lib/arm/armopts.s.in > lib/arm/armopts.s
|
||||
cp $1/LICENSE ./LICENSE
|
||||
cp $1/CHANGES ./CHANGES
|
||||
cp $1/COPYING ./COPYING
|
||||
@ -16,8 +25,6 @@ cp $1/lib/apiwrapper.c ./lib/
|
||||
cp $1/lib/apiwrapper.h ./lib/
|
||||
cp $1/lib/bitpack.c ./lib/
|
||||
cp $1/lib/bitpack.h ./lib/
|
||||
cp $1/lib/cpu.c ./lib/
|
||||
cp $1/lib/cpu.h ./lib/
|
||||
cp $1/lib/dct.h ./lib/
|
||||
cp $1/lib/decapiwrapper.c ./lib/
|
||||
cp $1/lib/decinfo.c ./lib/
|
||||
@ -25,13 +32,9 @@ cp $1/lib/decint.h ./lib/
|
||||
cp $1/lib/decode.c ./lib/
|
||||
cp $1/lib/dequant.c ./lib/
|
||||
cp $1/lib/dequant.h ./lib/
|
||||
cp $1/lib/encint.h ./lib/
|
||||
cp $1/lib/encoder_disabled.c ./lib/
|
||||
cp $1/lib/enquant.h ./lib/
|
||||
cp $1/lib/fragment.c ./lib/
|
||||
cp $1/lib/huffdec.c ./lib/
|
||||
cp $1/lib/huffdec.h ./lib/
|
||||
cp $1/lib/huffenc.h ./lib/
|
||||
cp $1/lib/huffman.h ./lib/
|
||||
cp $1/lib/idct.c ./lib/
|
||||
cp $1/lib/info.c ./lib/
|
||||
@ -42,22 +45,36 @@ cp $1/lib/ocintrin.h ./lib/
|
||||
cp $1/lib/quant.c ./lib/
|
||||
cp $1/lib/quant.h ./lib/
|
||||
cp $1/lib/state.c ./lib/
|
||||
cp $1/lib/state.h ./lib/
|
||||
cp $1/lib/arm/arm2gnu.pl ./lib/arm/
|
||||
cp $1/lib/arm/armbits.h ./lib/arm/
|
||||
cp $1/lib/arm/armbits.s ./lib/arm/
|
||||
cp $1/lib/arm/armcpu.c ./lib/arm/
|
||||
cp $1/lib/arm/armcpu.h ./lib/arm/
|
||||
cp $1/lib/arm/armfrag.s ./lib/arm/
|
||||
cp $1/lib/arm/armidct.s ./lib/arm/
|
||||
cp $1/lib/arm/armint.h ./lib/arm/
|
||||
cp $1/lib/arm/armloop.s ./lib/arm/
|
||||
cp $1/lib/arm/armstate.c ./lib/arm/
|
||||
cp $1/lib/x86/mmxfrag.c ./lib/x86/
|
||||
cp $1/lib/x86/mmxfrag.h ./lib/x86/
|
||||
cp $1/lib/x86/mmxidct.c ./lib/x86/
|
||||
cp $1/lib/x86/mmxloop.h ./lib/x86/
|
||||
cp $1/lib/x86/mmxstate.c ./lib/x86/
|
||||
cp $1/lib/x86/sse2idct.c ./lib/x86/
|
||||
cp $1/lib/x86/sse2trans.h ./lib/x86/
|
||||
cp $1/lib/x86/x86cpu.c ./lib/x86/
|
||||
cp $1/lib/x86/x86cpu.h ./lib/x86/
|
||||
cp $1/lib/x86/x86int.h ./lib/x86/
|
||||
cp $1/lib/x86/x86state.c ./lib/x86/
|
||||
cp $1/lib/x86_vc/mmxfrag.c ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/mmxfrag.h ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/mmxidct.c ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/mmxloop.h ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/mmxstate.c ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/x86cpu.c ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/x86cpu.h ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/x86int.h ./lib/x86_vc/
|
||||
cp $1/lib/x86_vc/x86state.c ./lib/x86_vc/
|
||||
cp $1/include/theora/theora.h ./include/theora/theora.h
|
||||
cp $1/include/theora/theoradec.h ./include/theora/theoradec.h
|
||||
cp $1/include/theora/theoraenc.h ./include/theora/theoraenc.h
|
||||
cp $1/include/theora/codec.h ./include/theora/codec.h
|
||||
patch -p3 <bug559343.patch
|
||||
|
Loading…
Reference in New Issue
Block a user